diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst index 4911e797cb..c4ac7fdf38 100644 --- a/docs/tools/virtiofsd.rst +++ b/docs/tools/virtiofsd.rst @@ -101,6 +101,9 @@ Options Enable/disable extended attributes (xattr) on files and directories. The default is ``no_xattr``. + * posix_acl|no_posix_acl - + Enable/disable posix acl support. Posix ACLs are disabled by default`. + .. option:: --socket-path=PATH Listen on vhost-user UNIX domain socket at PATH. @@ -127,8 +130,8 @@ Options timeout. ``always`` sets a long cache lifetime at the expense of coherency. The default is ``auto``. -xattr-mapping -------------- +Extended attribute (xattr) mapping +---------------------------------- By default the name of xattr's used by the client are passed through to the server file system. This can be a problem where either those xattr names are used @@ -136,6 +139,9 @@ by something on the server (e.g. selinux client/server confusion) or if the virtiofsd is running in a container with restricted privileges where it cannot access some attributes. +Mapping syntax +~~~~~~~~~~~~~~ + A mapping of xattr names can be made using -o xattrmap=mapping where the ``mapping`` string consists of a series of rules. @@ -232,8 +238,48 @@ Note: When the 'security.capability' xattr is remapped, the daemon has to do extra work to remove it during many operations, which the host kernel normally does itself. -xattr-mapping Examples ----------------------- +Security considerations +~~~~~~~~~~~~~~~~~~~~~~~ + +Operating systems typically partition the xattr namespace using +well defined name prefixes. Each partition may have different +access controls applied. For example, on Linux there are multiple +partitions + + * ``system.*`` - access varies depending on attribute & filesystem + * ``security.*`` - only processes with CAP_SYS_ADMIN + * ``trusted.*`` - only processes with CAP_SYS_ADMIN + * ``user.*`` - any process granted by file permissions / ownership + +While other OS such as FreeBSD have different name prefixes +and access control rules. + +When remapping attributes on the host, it is important to +ensure that the remapping does not allow a guest user to +evade the guest access control rules. + +Consider if ``trusted.*`` from the guest was remapped to +``user.virtiofs.trusted*`` in the host. An unprivileged +user in a Linux guest has the ability to write to xattrs +under ``user.*``. Thus the user can evade the access +control restriction on ``trusted.*`` by instead writing +to ``user.virtiofs.trusted.*``. + +As noted above, the partitions used and access controls +applied, will vary across guest OS, so it is not wise to +try to predict what the guest OS will use. + +The simplest way to avoid an insecure configuration is +to remap all xattrs at once, to a given fixed prefix. +This is shown in example (1) below. + +If selectively mapping only a subset of xattr prefixes, +then rules must be added to explicitly block direct +access to the target of the remapping. This is shown +in example (2) below. + +Mapping examples +~~~~~~~~~~~~~~~~ 1) Prefix all attributes with 'user.virtiofs.' @@ -271,7 +317,9 @@ stripping of 'user.virtiofs.'. The second rule hides unprefixed 'trusted.' attributes on the host. The third rule stops a guest from explicitly setting -the 'user.virtiofs.' path directly. +the 'user.virtiofs.' path directly to prevent access +control bypass on the target of the earlier prefix +remapping. Finally, the fourth rule lets all remaining attributes through. diff --git a/migration/migration.c b/migration/migration.c index 4228635d18..5ff7ba9d5c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -456,10 +456,6 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp) { const char *p = NULL; - if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) { - return; - } - qapi_event_send_migration(MIGRATION_STATUS_SETUP); if (strstart(uri, "tcp:", &p) || strstart(uri, "unix:", NULL) || @@ -474,7 +470,6 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp) } else if (strstart(uri, "fd:", &p)) { fd_start_incoming_migration(p, errp); } else { - yank_unregister_instance(MIGRATION_YANK_INSTANCE); error_setg(errp, "unknown migration protocol: %s", uri); } } @@ -2083,9 +2078,14 @@ void qmp_migrate_incoming(const char *uri, Error **errp) return; } + if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) { + return; + } + qemu_start_incoming_migration(uri, &local_err); if (local_err) { + yank_unregister_instance(MIGRATION_YANK_INSTANCE); error_propagate(errp, local_err); return; } @@ -2097,6 +2097,13 @@ void qmp_migrate_recover(const char *uri, Error **errp) { MigrationIncomingState *mis = migration_incoming_get_current(); + /* + * Don't even bother to use ERRP_GUARD() as it _must_ always be set by + * callers (no one should ignore a recover failure); if there is, it's a + * programming error. + */ + assert(errp); + if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) { error_setg(errp, "Migrate recover can only be run " "when postcopy is paused."); @@ -2114,8 +2121,13 @@ void qmp_migrate_recover(const char *uri, Error **errp) * only re-setup the migration stream and poke existing migration * to continue using that newly established channel. */ - yank_unregister_instance(MIGRATION_YANK_INSTANCE); qemu_start_incoming_migration(uri, errp); + + /* Safe to dereference with the assert above */ + if (*errp) { + /* Reset the flag so user could still retry */ + qatomic_set(&mis->postcopy_recover_triggered, false); + } } void qmp_migrate_pause(Error **errp) @@ -3664,6 +3676,39 @@ bool migration_rate_limit(void) return urgent; } +/* + * if failover devices are present, wait they are completely + * unplugged + */ + +static void qemu_savevm_wait_unplug(MigrationState *s, int old_state, + int new_state) +{ + if (qemu_savevm_state_guest_unplug_pending()) { + migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG); + + while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && + qemu_savevm_state_guest_unplug_pending()) { + qemu_sem_timedwait(&s->wait_unplug_sem, 250); + } + if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) { + int timeout = 120; /* 30 seconds */ + /* + * migration has been canceled + * but as we have started an unplug we must wait the end + * to be able to plug back the card + */ + while (timeout-- && qemu_savevm_state_guest_unplug_pending()) { + qemu_sem_timedwait(&s->wait_unplug_sem, 250); + } + } + + migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state); + } else { + migrate_set_state(&s->state, old_state, new_state); + } +} + /* * Master migration thread on the source VM. * It drives the migration and pumps the data down the outgoing channel. @@ -3710,22 +3755,10 @@ static void *migration_thread(void *opaque) qemu_savevm_state_setup(s->to_dst_file); - if (qemu_savevm_state_guest_unplug_pending()) { - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_WAIT_UNPLUG); - - while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && - qemu_savevm_state_guest_unplug_pending()) { - qemu_sem_timedwait(&s->wait_unplug_sem, 250); - } - - migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, - MIGRATION_STATUS_ACTIVE); - } + qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_ACTIVE); s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_ACTIVE); trace_migration_thread_setup_complete(); @@ -3833,21 +3866,9 @@ static void *bg_migration_thread(void *opaque) qemu_savevm_state_header(s->to_dst_file); qemu_savevm_state_setup(s->to_dst_file); - if (qemu_savevm_state_guest_unplug_pending()) { - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_WAIT_UNPLUG); + qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_ACTIVE); - while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && - qemu_savevm_state_guest_unplug_pending()) { - qemu_sem_timedwait(&s->wait_unplug_sem, 250); - } - - migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, - MIGRATION_STATUS_ACTIVE); - } else { - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_ACTIVE); - } s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; trace_migration_thread_setup_complete(); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index d6e03dbc0e..1eacf9e831 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -416,6 +416,11 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size, { f->iov[f->iovcnt - 1].iov_len += size; } else { + if (f->iovcnt >= MAX_IOV_SIZE) { + /* Should only happen if a previous fflush failed */ + assert(f->shutdown || !qemu_file_is_writable(f)); + return 1; + } if (may_free) { set_bit(f->iovcnt, f->may_free); } diff --git a/migration/rdma.c b/migration/rdma.c index d90b29a4b5..b6cc4bef4a 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -1006,7 +1006,7 @@ route: if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { ERROR(errp, "result not equal to event_addr_resolved %s", rdma_event_str(cm_event->event)); - perror("rdma_resolve_addr"); + error_report("rdma_resolve_addr"); rdma_ack_cm_event(cm_event); ret = -EINVAL; goto err_resolve_get_addr; @@ -2544,7 +2544,7 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path) } if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { - perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect"); + error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect"); ERROR(errp, "connecting to destination!"); rdma_ack_cm_event(cm_event); goto err_rdma_source_connect; diff --git a/tests/migration/guestperf/engine.py b/tests/migration/guestperf/engine.py index 208e095794..7c991c4407 100644 --- a/tests/migration/guestperf/engine.py +++ b/tests/migration/guestperf/engine.py @@ -113,7 +113,7 @@ class Engine(object): vcpus = src.command("query-cpus-fast") src_threads = [] for vcpu in vcpus: - src_threads.append(vcpu["thread_id"]) + src_threads.append(vcpu["thread-id"]) # XXX how to get dst timings on remote host ? @@ -153,7 +153,7 @@ class Engine(object): max_bandwidth=scenario._bandwidth * 1024 * 1024) resp = src.command("migrate-set-parameters", - downtime_limit=scenario._downtime / 1024.0) + downtime_limit=scenario._downtime) if scenario._compression_mt: resp = src.command("migrate-set-capabilities", diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 2b028df687..328d6dbe97 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -27,6 +27,10 @@ #include "migration-helpers.h" #include "tests/migration/migration-test.h" +#if defined(__linux__) +#include "linux/kvm.h" +#endif + /* TODO actually test the results and get rid of this */ #define qtest_qmp_discard_response(...) qobject_unref(qtest_qmp(__VA_ARGS__)) @@ -467,6 +471,8 @@ typedef struct { bool use_shmem; /* only launch the target process */ bool only_target; + /* Use dirty ring if true; dirty logging otherwise */ + bool use_dirty_ring; char *opts_source; char *opts_target; } MigrateStart; @@ -573,11 +579,13 @@ static int test_migrate_start(QTestState **from, QTestState **to, shmem_opts = g_strdup(""); } - cmd_source = g_strdup_printf("-accel kvm -accel tcg%s%s " + cmd_source = g_strdup_printf("-accel kvm%s -accel tcg%s%s " "-name source,debug-threads=on " "-m %s " "-serial file:%s/src_serial " "%s %s %s %s", + args->use_dirty_ring ? + ",dirty-ring-size=4096" : "", machine_opts ? " -machine " : "", machine_opts ? machine_opts : "", memory_size, tmpfs, @@ -587,12 +595,14 @@ static int test_migrate_start(QTestState **from, QTestState **to, *from = qtest_init(cmd_source); } - cmd_target = g_strdup_printf("-accel kvm -accel tcg%s%s " + cmd_target = g_strdup_printf("-accel kvm%s -accel tcg%s%s " "-name target,debug-threads=on " "-m %s " "-serial file:%s/dest_serial " "-incoming %s " "%s %s %s %s", + args->use_dirty_ring ? + ",dirty-ring-size=4096" : "", machine_opts ? " -machine " : "", machine_opts ? machine_opts : "", memory_size, tmpfs, uri, @@ -785,12 +795,14 @@ static void test_baddest(void) test_migrate_end(from, to, false); } -static void test_precopy_unix(void) +static void test_precopy_unix_common(bool dirty_ring) { g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); MigrateStart *args = migrate_start_new(); QTestState *from, *to; + args->use_dirty_ring = dirty_ring; + if (test_migrate_start(&from, &to, uri, args)) { return; } @@ -825,6 +837,18 @@ static void test_precopy_unix(void) test_migrate_end(from, to, true); } +static void test_precopy_unix(void) +{ + /* Using default dirty logging */ + test_precopy_unix_common(false); +} + +static void test_precopy_unix_dirty_ring(void) +{ + /* Using dirty ring tracking */ + test_precopy_unix_common(true); +} + #if 0 /* Currently upset on aarch64 TCG */ static void test_ignore_shared(void) @@ -1369,6 +1393,29 @@ static void test_multifd_tcp_cancel(void) test_migrate_end(from, to2, true); } +static bool kvm_dirty_ring_supported(void) +{ +#if defined(__linux__) + int ret, kvm_fd = open("/dev/kvm", O_RDONLY); + + if (kvm_fd < 0) { + return false; + } + + ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DIRTY_LOG_RING); + close(kvm_fd); + + /* We test with 4096 slots */ + if (ret < 4096) { + return false; + } + + return true; +#else + return false; +#endif +} + int main(int argc, char **argv) { char template[] = "/tmp/migration-test-XXXXXX"; @@ -1439,6 +1486,11 @@ int main(int argc, char **argv) qtest_add_func("/migration/multifd/tcp/zstd", test_multifd_tcp_zstd); #endif + if (kvm_dirty_ring_supported()) { + qtest_add_func("/migration/dirty_ring", + test_precopy_unix_dirty_ring); + } + ret = g_test_run(); g_assert_cmpint(ret, ==, 0); diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h index fa9671872e..0c2665b977 100644 --- a/tools/virtiofsd/fuse_common.h +++ b/tools/virtiofsd/fuse_common.h @@ -372,6 +372,11 @@ struct fuse_file_info { */ #define FUSE_CAP_HANDLE_KILLPRIV_V2 (1 << 28) +/** + * Indicates that file server supports extended struct fuse_setxattr_in + */ +#define FUSE_CAP_SETXATTR_EXT (1 << 29) + /** * Ioctl flags * diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c index 7fe2cef1eb..e4679c73ab 100644 --- a/tools/virtiofsd/fuse_lowlevel.c +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -1084,6 +1084,12 @@ static void do_open(fuse_req_t req, fuse_ino_t nodeid, return; } + /* File creation is handled by do_create() or do_mknod() */ + if (arg->flags & (O_CREAT | O_TMPFILE)) { + fuse_reply_err(req, EINVAL); + return; + } + memset(&fi, 0, sizeof(fi)); fi.flags = arg->flags; fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID; @@ -1419,8 +1425,13 @@ static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, struct fuse_setxattr_in *arg; const char *name; const char *value; + bool setxattr_ext = req->se->conn.want & FUSE_CAP_SETXATTR_EXT; - arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (setxattr_ext) { + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + } else { + arg = fuse_mbuf_iter_advance(iter, FUSE_COMPAT_SETXATTR_IN_SIZE); + } name = fuse_mbuf_iter_advance_str(iter); if (!arg || !name) { fuse_reply_err(req, EINVAL); @@ -1434,7 +1445,9 @@ static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, } if (req->se->op.setxattr) { - req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags); + uint32_t setxattr_flags = setxattr_ext ? arg->setxattr_flags : 0; + req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags, + setxattr_flags); } else { fuse_reply_err(req, ENOSYS); } @@ -1981,6 +1994,9 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2; } + if (arg->flags & FUSE_SETXATTR_EXT) { + se->conn.capable |= FUSE_CAP_SETXATTR_EXT; + } #ifdef HAVE_SPLICE #ifdef HAVE_VMSPLICE se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; @@ -2116,6 +2132,10 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, outarg.flags |= FUSE_HANDLE_KILLPRIV_V2; } + if (se->conn.want & FUSE_CAP_SETXATTR_EXT) { + outarg.flags |= FUSE_SETXATTR_EXT; + } + fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead); diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h index 3bf786b034..4b4e8c9724 100644 --- a/tools/virtiofsd/fuse_lowlevel.h +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -798,7 +798,8 @@ struct fuse_lowlevel_ops { * fuse_reply_err */ void (*setxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, - const char *value, size_t size, int flags); + const char *value, size_t size, int flags, + uint32_t setxattr_flags); /** * Get an extended attribute diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c index 5e98ed702b..a8295d975a 100644 --- a/tools/virtiofsd/helper.c +++ b/tools/virtiofsd/helper.c @@ -186,6 +186,7 @@ void fuse_cmdline_help(void) " to virtiofsd from guest applications.\n" " default: no_allow_direct_io\n" " -o announce_submounts Announce sub-mount points to the guest\n" + " -o posix_acl/no_posix_acl Enable/Disable posix_acl. (default: disabled)\n" ); } diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 49c21fd855..38b2af8599 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -122,6 +122,7 @@ struct lo_inode { struct lo_cred { uid_t euid; gid_t egid; + mode_t umask; }; enum { @@ -172,6 +173,9 @@ struct lo_data { /* An O_PATH file descriptor to /proc/self/fd/ */ int proc_self_fd; int user_killpriv_v2, killpriv_v2; + /* If set, virtiofsd is responsible for setting umask during creation */ + bool change_umask; + int user_posix_acl, posix_acl; }; static const struct fuse_opt lo_opts[] = { @@ -204,6 +208,8 @@ static const struct fuse_opt lo_opts[] = { { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 }, { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 }, { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 }, + { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 }, + { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 }, FUSE_OPT_END }; static bool use_syslog = false; @@ -702,6 +708,32 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2; lo->killpriv_v2 = 0; } + + if (lo->user_posix_acl == 1) { + /* + * User explicitly asked for this option. Enable it unconditionally. + * If connection does not have this capability, print error message + * now. It will fail later in fuse_lowlevel.c + */ + if (!(conn->capable & FUSE_CAP_POSIX_ACL) || + !(conn->capable & FUSE_CAP_DONT_MASK) || + !(conn->capable & FUSE_CAP_SETXATTR_EXT)) { + fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl." + " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK" + " or FUSE_SETXATTR_EXT capability.\n"); + } else { + fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n"); + } + + conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK | + FUSE_CAP_SETXATTR_EXT; + lo->change_umask = true; + lo->posix_acl = true; + } else { + /* User either did not specify anything or wants it disabled */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n"); + conn->want &= ~FUSE_CAP_POSIX_ACL; + } } static void lo_getattr(fuse_req_t req, fuse_ino_t ino, @@ -1134,7 +1166,8 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) * ownership of caller. * TODO: What about selinux context? */ -static int lo_change_cred(fuse_req_t req, struct lo_cred *old) +static int lo_change_cred(fuse_req_t req, struct lo_cred *old, + bool change_umask) { int res; @@ -1154,11 +1187,14 @@ static int lo_change_cred(fuse_req_t req, struct lo_cred *old) return errno_save; } + if (change_umask) { + old->umask = umask(req->ctx.umask); + } return 0; } /* Regain Privileges */ -static void lo_restore_cred(struct lo_cred *old) +static void lo_restore_cred(struct lo_cred *old, bool restore_umask) { int res; @@ -1173,6 +1209,54 @@ static void lo_restore_cred(struct lo_cred *old) fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid); exit(1); } + + if (restore_umask) + umask(old->umask); +} + +/* + * A helper to change cred and drop capability. Returns 0 on success and + * errno on error + */ +static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old, + bool change_umask, const char *cap_name, + bool *cap_dropped) +{ + int ret; + bool __cap_dropped; + + assert(cap_name); + + ret = drop_effective_cap(cap_name, &__cap_dropped); + if (ret) { + return ret; + } + + ret = lo_change_cred(req, old, change_umask); + if (ret) { + if (__cap_dropped) { + if (gain_effective_cap(cap_name)) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name); + } + } + } + + if (cap_dropped) { + *cap_dropped = __cap_dropped; + } + return ret; +} + +static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask, + const char *cap_name) +{ + assert(cap_name); + + lo_restore_cred(old, restore_umask); + + if (gain_effective_cap(cap_name)) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name); + } } static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, @@ -1202,7 +1286,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, return; } - saverr = lo_change_cred(req, &old); + saverr = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode)); if (saverr) { goto out; } @@ -1211,7 +1295,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, saverr = errno; - lo_restore_cred(&old); + lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode)); if (res == -1) { goto out; @@ -1917,7 +2001,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, return; } - err = lo_change_cred(req, &old); + err = lo_change_cred(req, &old, lo->change_umask); if (err) { goto out; } @@ -1928,7 +2012,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode); err = fd == -1 ? errno : 0; - lo_restore_cred(&old); + lo_restore_cred(&old, lo->change_umask); /* Ignore the error if file exists and O_EXCL was not given */ if (err && (err != EEXIST || (fi->flags & O_EXCL))) { @@ -2727,6 +2811,63 @@ static int xattr_map_server(const struct lo_data *lo, const char *server_name, assert(fchdir_res == 0); \ } while (0) +static bool block_xattr(struct lo_data *lo, const char *name) +{ + /* + * If user explicitly enabled posix_acl or did not provide any option, + * do not block acl. Otherwise block system.posix_acl_access and + * system.posix_acl_default xattrs. + */ + if (lo->user_posix_acl) { + return false; + } + if (!strcmp(name, "system.posix_acl_access") || + !strcmp(name, "system.posix_acl_default")) + return true; + + return false; +} + +/* + * Returns number of bytes in xattr_list after filtering on success. This + * could be zero as well if nothing is left after filtering. + * + * Returns negative error code on failure. + * xattr_list is modified in place. + */ +static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list, + unsigned in_size) +{ + size_t out_index, in_index; + + /* + * As of now we only filter out acl xattrs. If acls are enabled or + * they have not been explicitly disabled, there is nothing to + * filter. + */ + if (lo->user_posix_acl) { + return in_size; + } + + out_index = 0; + in_index = 0; + while (in_index < in_size) { + char *in_ptr = xattr_list + in_index; + + /* Length of current attribute name */ + size_t in_len = strlen(xattr_list + in_index) + 1; + + if (!block_xattr(lo, in_ptr)) { + if (in_index != out_index) { + memmove(xattr_list + out_index, xattr_list + in_index, in_len); + } + out_index += in_len; + } + in_index += in_len; + } + return out_index; +} + static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, size_t size) { @@ -2740,6 +2881,11 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, int saverr; int fd = -1; + if (block_xattr(lo, in_name)) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + mapped_name = NULL; name = in_name; if (lo->xattrmap) { @@ -2791,15 +2937,17 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, goto out_err; } ret = fgetxattr(fd, name, value, size); + saverr = ret == -1 ? errno : 0; } else { /* fchdir should not fail here */ FCHDIR_NOFAIL(lo->proc_self_fd); ret = getxattr(procname, name, value, size); + saverr = ret == -1 ? errno : 0; FCHDIR_NOFAIL(lo->root.fd); } if (ret == -1) { - goto out_err; + goto out; } if (size) { saverr = 0; @@ -2864,15 +3012,17 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) goto out_err; } ret = flistxattr(fd, value, size); + saverr = ret == -1 ? errno : 0; } else { /* fchdir should not fail here */ FCHDIR_NOFAIL(lo->proc_self_fd); ret = listxattr(procname, value, size); + saverr = ret == -1 ? errno : 0; FCHDIR_NOFAIL(lo->root.fd); } if (ret == -1) { - goto out_err; + goto out; } if (size) { saverr = 0; @@ -2926,6 +3076,12 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) goto out; } } + + ret = remove_blocked_xattrs(lo, value, ret); + if (ret <= 0) { + saverr = -ret; + goto out; + } fuse_reply_buf(req, value, ret); } else { /* @@ -2951,7 +3107,8 @@ out: } static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, - const char *value, size_t size, int flags) + const char *value, size_t size, int flags, + uint32_t extra_flags) { char procname[64]; const char *name; @@ -2961,6 +3118,14 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, ssize_t ret; int saverr; int fd = -1; + bool switched_creds = false; + bool cap_fsetid_dropped = false; + struct lo_cred old = {}; + + if (block_xattr(lo, in_name)) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } mapped_name = NULL; name = in_name; @@ -2991,6 +3156,26 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, ", name=%s value=%s size=%zd)\n", ino, name, value, size); sprintf(procname, "%i", inode->fd); + /* + * If we are setting posix access acl and if SGID needs to be + * cleared, then switch to caller's gid and drop CAP_FSETID + * and that should make sure host kernel clears SGID. + * + * This probably will not work when we support idmapped mounts. + * In that case we will need to find a non-root gid and switch + * to it. (Instead of gid in request). Fix it when we support + * idmapped mounts. + */ + if (lo->posix_acl && !strcmp(name, "system.posix_acl_access") + && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) { + ret = lo_drop_cap_change_cred(req, &old, false, "FSETID", + &cap_fsetid_dropped); + if (ret) { + saverr = ret; + goto out; + } + switched_creds = true; + } if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { fd = openat(lo->proc_self_fd, procname, O_RDONLY); if (fd < 0) { @@ -2998,14 +3183,20 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, goto out; } ret = fsetxattr(fd, name, value, size, flags); + saverr = ret == -1 ? errno : 0; } else { /* fchdir should not fail here */ FCHDIR_NOFAIL(lo->proc_self_fd); ret = setxattr(procname, name, value, size, flags); + saverr = ret == -1 ? errno : 0; FCHDIR_NOFAIL(lo->root.fd); } - - saverr = ret == -1 ? errno : 0; + if (switched_creds) { + if (cap_fsetid_dropped) + lo_restore_cred_gain_cap(&old, false, "FSETID"); + else + lo_restore_cred(&old, false); + } out: if (fd >= 0) { @@ -3028,6 +3219,11 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name) int saverr; int fd = -1; + if (block_xattr(lo, in_name)) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + mapped_name = NULL; name = in_name; if (lo->xattrmap) { @@ -3064,15 +3260,15 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name) goto out; } ret = fremovexattr(fd, name); + saverr = ret == -1 ? errno : 0; } else { /* fchdir should not fail here */ FCHDIR_NOFAIL(lo->proc_self_fd); ret = removexattr(procname, name); + saverr = ret == -1 ? errno : 0; FCHDIR_NOFAIL(lo->root.fd); } - saverr = ret == -1 ? errno : 0; - out: if (fd >= 0) { close(fd); @@ -3559,10 +3755,6 @@ static void setup_nofile_rlimit(unsigned long rlimit_nofile) static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) { g_autofree char *localfmt = NULL; - struct timespec ts; - struct tm tm; - char sec_fmt[sizeof "2020-12-07 18:17:54"]; - char zone_fmt[sizeof "+0100"]; if (current_log_level < level) { return; @@ -3574,23 +3766,10 @@ static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), fmt); } else { - /* try formatting a broken-down timestamp */ - if (clock_gettime(CLOCK_REALTIME, &ts) != -1 && - localtime_r(&ts.tv_sec, &tm) != NULL && - strftime(sec_fmt, sizeof sec_fmt, "%Y-%m-%d %H:%M:%S", - &tm) != 0 && - strftime(zone_fmt, sizeof zone_fmt, "%z", &tm) != 0) { - localfmt = g_strdup_printf("[%s.%02ld%s] [ID: %08ld] %s", - sec_fmt, - ts.tv_nsec / (10L * 1000 * 1000), - zone_fmt, syscall(__NR_gettid), - fmt); - } else { - /* fall back to a flat timestamp */ - localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s", - get_clock(), syscall(__NR_gettid), - fmt); - } + g_autoptr(GDateTime) now = g_date_time_new_now_utc(); + g_autofree char *nowstr = g_date_time_format(now, "%Y-%m-%d %H:%M:%S.%f%z"); + localfmt = g_strdup_printf("[%s] [ID: %08ld] %s", + nowstr, syscall(__NR_gettid), fmt); } fmt = localfmt; } @@ -3722,6 +3901,7 @@ int main(int argc, char *argv[]) .allow_direct_io = 0, .proc_self_fd = -1, .user_killpriv_v2 = -1, + .user_posix_acl = -1, }; struct lo_map_elem *root_elem; struct lo_map_elem *reserve_elem; @@ -3850,6 +4030,12 @@ int main(int argc, char *argv[]) exit(1); } + if (lo.user_posix_acl == 1 && !lo.xattr) { + fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled." + "\n"); + exit(1); + } + lo.use_statx = true; se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c index 62441cfcdb..f49ed94b5e 100644 --- a/tools/virtiofsd/passthrough_seccomp.c +++ b/tools/virtiofsd/passthrough_seccomp.c @@ -114,6 +114,7 @@ static const int syscall_allowlist[] = { SCMP_SYS(utimensat), SCMP_SYS(write), SCMP_SYS(writev), + SCMP_SYS(umask), }; /* Syscalls used when --syslog is enabled */