virtiofsd pull 2021-02-16

Vivek's support for new FUSE KILLPRIV_V2
 and some smaller cleanups.
 
 Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEERfXHG0oMt/uXep+pBRYzHrxb/ecFAmAsEDgACgkQBRYzHrxb
 /ee1Hg/+PXJXLzTZMPdM97BLGyE5k7jgUXfYSCC+VTC87PR/nOUV7LAU12Lqdmiz
 NZG9GoqdRusw4vjc1FF/GcqJYWERrknWDjbpNB3rUfP1I87mWuMXk9HKfubnqP78
 JeeLx6O2q7O8V9CLj6lFcX96fa8umSJjYvpHB7jGuvmfRUgNOa3f9QiU7I9ySThn
 lYCcpqbd/k27eFGAzjy5T5l1SVZTnbVWsM0QoIjRZP27aublECDKJva9owX8m2AC
 50QAoS5hhNIc1mYe45xAB3fibWFh8oD3Onfr1HPRjDLOyL9F8rXZeQibzLLIFwIc
 uC2fYQ07Mywt91f6s6ns9IAxfbnUtujK2Wxcc38xr7Cs5WoHXFhqjPBYPuWTl3Mo
 XFBRH2J5CQKsU0BswOTdYo6DRwJqASSXyD6aEm/Rl5GibIOPE86+UedV7mPLJNYM
 6cWbC9zxlS7ImjUxrTHu4zaVCLJz4AO+Z12uT4KbbieDS6mRznrBwZbkiEho9wR1
 A5UVTs7vAWpthdHJFvImV4SwjkmcYx+LhQcIqK7HdJIG9fRP4jnrnsHhCxaQQPIn
 4yl+jgQbN45imRazdIiFTCIJ1MwGU7dxOX3OQI6FjTpGENHve3XuO1h1c025TtcD
 7gheE0sgVSsddEMAvl8zQ1ZW/eQyL/aTI4ctgUkxiwN5gct9L+k=
 =3eKx
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20210216' into staging

virtiofsd pull 2021-02-16

Vivek's support for new FUSE KILLPRIV_V2
and some smaller cleanups.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

# gpg: Signature made Tue 16 Feb 2021 18:34:32 GMT
# gpg:                using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full]
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A  9FA9 0516 331E BC5B FDE7

* remotes/dgilbert-gitlab/tags/pull-virtiofs-20210216:
  virtiofsd: Do not use a thread pool by default
  viriofsd: Add support for FUSE_HANDLE_KILLPRIV_V2
  virtiofsd: Save error code early at the failure callsite
  tools/virtiofsd: Replace the word 'whitelist'
  virtiofsd: vu_dispatch locking should never fail
  virtiofsd: Allow to build it without the tools

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2021-02-17 14:44:18 +00:00
commit 1af5629673
7 changed files with 158 additions and 38 deletions

View File

@ -10,8 +10,11 @@ if get_option('virtiofsd').enabled()
error('virtiofsd requires Linux')
elif not seccomp.found() or not libcap_ng.found()
error('virtiofsd requires libcap-ng-devel and seccomp-devel')
elif not have_tools or 'CONFIG_VHOST_USER' not in config_host
error('virtiofsd needs tools and vhost-user support')
elif 'CONFIG_VHOST_USER' not in config_host
error('virtiofsd needs vhost-user support')
else
# Disabled all the tools but virtiofsd.
have_virtiofsd = true
endif
endif
elif get_option('virtiofsd').disabled() or not have_system

View File

@ -357,6 +357,21 @@ struct fuse_file_info {
*/
#define FUSE_CAP_SUBMOUNTS (1 << 27)
/**
* Indicates that the filesystem is responsible for clearing
* security.capability xattr and clearing setuid and setgid bits. Following
* are the rules.
* - clear "security.capability" on write, truncate and chown unconditionally
* - clear suid/sgid if following is true. Note, sgid is cleared only if
* group executable bit is set.
* o setattr has FATTR_SIZE and FATTR_KILL_SUIDGID set.
* o setattr has FATTR_UID or FATTR_GID
* o open has O_TRUNC and FUSE_OPEN_KILL_SUIDGID
* o create has O_TRUNC and FUSE_OPEN_KILL_SUIDGID flag set.
* o write has FUSE_WRITE_KILL_SUIDGID
*/
#define FUSE_CAP_HANDLE_KILLPRIV_V2 (1 << 28)
/**
* Ioctl flags
*

View File

@ -18,7 +18,7 @@
#include <sys/file.h>
#define THREAD_POOL_SIZE 64
#define THREAD_POOL_SIZE 0
#define OFFSET_MAX 0x7fffffffffffffffLL
@ -855,7 +855,7 @@ static void do_setattr(fuse_req_t req, fuse_ino_t nodeid,
FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE |
FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME |
FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW |
FUSE_SET_ATTR_CTIME;
FUSE_SET_ATTR_CTIME | FUSE_SET_ATTR_KILL_SUIDGID;
req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi);
} else {
@ -1069,6 +1069,7 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid,
memset(&fi, 0, sizeof(fi));
fi.flags = arg->flags;
fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID;
req->ctx.umask = arg->umask;
@ -1092,6 +1093,7 @@ static void do_open(fuse_req_t req, fuse_ino_t nodeid,
memset(&fi, 0, sizeof(fi));
fi.flags = arg->flags;
fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID;
if (req->se->op.open) {
req->se->op.open(req, nodeid, &fi);
@ -1983,6 +1985,9 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
if (arg->flags & FUSE_SUBMOUNTS) {
se->conn.capable |= FUSE_CAP_SUBMOUNTS;
}
if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) {
se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2;
}
#ifdef HAVE_SPLICE
#ifdef HAVE_VMSPLICE
se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE;
@ -2114,6 +2119,10 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
outarg.congestion_threshold = se->conn.congestion_threshold;
outarg.time_gran = se->conn.time_gran;
if (se->conn.want & FUSE_CAP_HANDLE_KILLPRIV_V2) {
outarg.flags |= FUSE_HANDLE_KILLPRIV_V2;
}
fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor);
fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags);
fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead);

View File

@ -143,6 +143,7 @@ struct fuse_forget_data {
#define FUSE_SET_ATTR_ATIME_NOW (1 << 7)
#define FUSE_SET_ATTR_MTIME_NOW (1 << 8)
#define FUSE_SET_ATTR_CTIME (1 << 10)
#define FUSE_SET_ATTR_KILL_SUIDGID (1 << 11)
/*
* Request methods and replies

View File

@ -187,6 +187,31 @@ static void copy_iov(struct iovec *src_iov, int src_count,
}
}
/*
* pthread_rwlock_rdlock() and pthread_rwlock_wrlock can fail if
* a deadlock condition is detected or the current thread already
* owns the lock. They can also fail, like pthread_rwlock_unlock(),
* if the mutex wasn't properly initialized. None of these are ever
* expected to happen.
*/
static void vu_dispatch_rdlock(struct fv_VuDev *vud)
{
int ret = pthread_rwlock_rdlock(&vud->vu_dispatch_rwlock);
assert(ret == 0);
}
static void vu_dispatch_wrlock(struct fv_VuDev *vud)
{
int ret = pthread_rwlock_wrlock(&vud->vu_dispatch_rwlock);
assert(ret == 0);
}
static void vu_dispatch_unlock(struct fv_VuDev *vud)
{
int ret = pthread_rwlock_unlock(&vud->vu_dispatch_rwlock);
assert(ret == 0);
}
/*
* Called back by ll whenever it wants to send a reply/message back
* The 1st element of the iov starts with the fuse_out_header
@ -240,12 +265,12 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
copy_iov(iov, count, in_sg, in_num, tosend_len);
pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
vu_dispatch_rdlock(qi->virtio_dev);
pthread_mutex_lock(&qi->vq_lock);
vu_queue_push(dev, q, elem, tosend_len);
vu_queue_notify(dev, q);
pthread_mutex_unlock(&qi->vq_lock);
pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
vu_dispatch_unlock(qi->virtio_dev);
req->reply_sent = true;
@ -403,12 +428,12 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
ret = 0;
pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
vu_dispatch_rdlock(qi->virtio_dev);
pthread_mutex_lock(&qi->vq_lock);
vu_queue_push(dev, q, elem, tosend_len);
vu_queue_notify(dev, q);
pthread_mutex_unlock(&qi->vq_lock);
pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
vu_dispatch_unlock(qi->virtio_dev);
err:
if (ret == 0) {
@ -558,12 +583,12 @@ out:
fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
elem->index);
pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
vu_dispatch_rdlock(qi->virtio_dev);
pthread_mutex_lock(&qi->vq_lock);
vu_queue_push(dev, q, elem, 0);
vu_queue_notify(dev, q);
pthread_mutex_unlock(&qi->vq_lock);
pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
vu_dispatch_unlock(qi->virtio_dev);
}
pthread_mutex_destroy(&req->ch.lock);
@ -596,7 +621,6 @@ static void *fv_queue_thread(void *opaque)
qi->qidx, qi->kick_fd);
while (1) {
struct pollfd pf[2];
int ret;
pf[0].fd = qi->kick_fd;
pf[0].events = POLLIN;
@ -645,8 +669,7 @@ static void *fv_queue_thread(void *opaque)
break;
}
/* Mutual exclusion with virtio_loop() */
ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
assert(ret == 0); /* there is no possible error case */
vu_dispatch_rdlock(qi->virtio_dev);
pthread_mutex_lock(&qi->vq_lock);
/* out is from guest, in is too guest */
unsigned int in_bytes, out_bytes;
@ -672,7 +695,7 @@ static void *fv_queue_thread(void *opaque)
}
pthread_mutex_unlock(&qi->vq_lock);
pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
vu_dispatch_unlock(qi->virtio_dev);
/* Process all the requests. */
if (!se->thread_pool_size && req_list != NULL) {
@ -799,7 +822,6 @@ int virtio_loop(struct fuse_session *se)
while (!fuse_session_exited(se)) {
struct pollfd pf[1];
bool ok;
int ret;
pf[0].fd = se->vu_socketfd;
pf[0].events = POLLIN;
pf[0].revents = 0;
@ -825,12 +847,11 @@ int virtio_loop(struct fuse_session *se)
assert(pf[0].revents & POLLIN);
fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
/* Mutual exclusion with fv_queue_thread() */
ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock);
assert(ret == 0); /* there is no possible error case */
vu_dispatch_wrlock(se->virtio_dev);
ok = vu_dispatch(&se->virtio_dev->dev);
pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock);
vu_dispatch_unlock(se->virtio_dev);
if (!ok) {
fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);

View File

@ -168,6 +168,7 @@ struct lo_data {
/* An O_PATH file descriptor to /proc/self/fd/ */
int proc_self_fd;
int user_killpriv_v2, killpriv_v2;
};
static const struct fuse_opt lo_opts[] = {
@ -198,6 +199,8 @@ static const struct fuse_opt lo_opts[] = {
{ "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
{ "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
{ "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
{ "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
{ "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
FUSE_OPT_END
};
static bool use_syslog = false;
@ -630,6 +633,34 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn)
"does not support it\n");
lo->announce_submounts = false;
}
if (lo->user_killpriv_v2 == 1) {
/*
* User explicitly asked for this option. Enable it unconditionally.
* If connection does not have this capability, it should fail
* in fuse_lowlevel.c
*/
fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
lo->killpriv_v2 = 1;
} else if (lo->user_killpriv_v2 == -1 &&
conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) {
/*
* User did not specify a value for killpriv_v2. By default enable it
* if connection offers this capability
*/
fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
lo->killpriv_v2 = 1;
} else {
/*
* Either user specified to disable killpriv_v2, or connection does
* not offer this capability. Disable killpriv_v2 in both the cases
*/
fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n");
conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
lo->killpriv_v2 = 0;
}
}
static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
@ -698,6 +729,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
}
if (res == -1) {
saverr = errno;
goto out_err;
}
}
@ -707,27 +739,47 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
if (res == -1) {
saverr = errno;
goto out_err;
}
}
if (valid & FUSE_SET_ATTR_SIZE) {
int truncfd;
bool kill_suidgid;
bool cap_fsetid_dropped = false;
kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID);
if (fi) {
truncfd = fd;
} else {
truncfd = lo_inode_open(lo, inode, O_RDWR);
if (truncfd < 0) {
errno = -truncfd;
saverr = -truncfd;
goto out_err;
}
}
if (kill_suidgid) {
res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
if (res != 0) {
saverr = res;
if (!fi) {
close(truncfd);
}
goto out_err;
}
}
res = ftruncate(truncfd, attr->st_size);
saverr = res == -1 ? errno : 0;
if (cap_fsetid_dropped) {
if (gain_effective_cap("FSETID")) {
fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
}
}
if (!fi) {
saverr = errno;
close(truncfd);
errno = saverr;
}
if (res == -1) {
goto out_err;
@ -760,6 +812,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
res = utimensat(lo->proc_self_fd, procname, tv, 0);
}
if (res == -1) {
saverr = errno;
goto out_err;
}
}
@ -768,7 +821,6 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
return lo_getattr(req, ino, fi);
out_err:
saverr = errno;
lo_inode_put(lo, &inode);
fuse_reply_err(req, saverr);
}
@ -1708,11 +1760,27 @@ static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
{
ssize_t fh;
int fd = existing_fd;
int err;
bool cap_fsetid_dropped = false;
bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv;
update_open_flags(lo->writeback, lo->allow_direct_io, fi);
if (fd < 0) {
if (kill_suidgid) {
err = drop_effective_cap("FSETID", &cap_fsetid_dropped);
if (err) {
return err;
}
}
fd = lo_inode_open(lo, inode, fi->flags);
if (cap_fsetid_dropped) {
if (gain_effective_cap("FSETID")) {
fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
}
}
if (fd < 0) {
return -fd;
}
@ -1746,8 +1814,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
int err;
struct lo_cred old = {};
fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent,
name);
fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
" kill_priv=%d\n", parent, name, fi->kill_priv);
if (!is_safe_path_component(name)) {
fuse_reply_err(req, EINVAL);
@ -1980,8 +2048,8 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
struct lo_inode *inode = lo_inode(req, ino);
int err;
fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
fi->flags);
fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)"
"\n", ino, fi->flags, fi->kill_priv);
if (!inode) {
fuse_reply_err(req, EBADF);
@ -2120,12 +2188,14 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
out_buf.buf[0].pos = off;
fuse_log(FUSE_LOG_DEBUG,
"lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino,
out_buf.buf[0].size, (unsigned long)off);
"lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n",
ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv);
/*
* If kill_priv is set, drop CAP_FSETID which should lead to kernel
* clearing setuid/setgid on file.
* clearing setuid/setgid on file. Note, for WRITE, we need to do
* this even if killpriv_v2 is not enabled. fuse direct write path
* relies on this.
*/
if (fi->kill_priv) {
res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
@ -3204,7 +3274,7 @@ static void setup_mounts(const char *source)
}
/*
* Only keep whitelisted capabilities that are needed for file system operation
* Only keep capabilities in allowlist that are needed for file system operation
* The (possibly NULL) modcaps_in string passed in is free'd before exit.
*/
static void setup_capabilities(char *modcaps_in)
@ -3214,8 +3284,8 @@ static void setup_capabilities(char *modcaps_in)
capng_restore_state(&cap.saved);
/*
* Whitelist file system-related capabilities that are needed for a file
* server to act like root. Drop everything else like networking and
* Add to allowlist file system-related capabilities that are needed for a
* file server to act like root. Drop everything else like networking and
* sysadmin capabilities.
*
* Exclusions:
@ -3533,6 +3603,7 @@ int main(int argc, char *argv[])
.posix_lock = 0,
.allow_direct_io = 0,
.proc_self_fd = -1,
.user_killpriv_v2 = -1,
};
struct lo_map_elem *root_elem;
struct lo_map_elem *reserve_elem;

View File

@ -21,7 +21,7 @@
#endif
#endif
static const int syscall_whitelist[] = {
static const int syscall_allowlist[] = {
/* TODO ireg sem*() syscalls */
SCMP_SYS(brk),
SCMP_SYS(capget), /* For CAP_FSETID */
@ -117,12 +117,12 @@ static const int syscall_whitelist[] = {
};
/* Syscalls used when --syslog is enabled */
static const int syscall_whitelist_syslog[] = {
static const int syscall_allowlist_syslog[] = {
SCMP_SYS(send),
SCMP_SYS(sendto),
};
static void add_whitelist(scmp_filter_ctx ctx, const int syscalls[], size_t len)
static void add_allowlist(scmp_filter_ctx ctx, const int syscalls[], size_t len)
{
size_t i;
@ -153,10 +153,10 @@ void setup_seccomp(bool enable_syslog)
exit(1);
}
add_whitelist(ctx, syscall_whitelist, G_N_ELEMENTS(syscall_whitelist));
add_allowlist(ctx, syscall_allowlist, G_N_ELEMENTS(syscall_allowlist));
if (enable_syslog) {
add_whitelist(ctx, syscall_whitelist_syslog,
G_N_ELEMENTS(syscall_whitelist_syslog));
add_allowlist(ctx, syscall_allowlist_syslog,
G_N_ELEMENTS(syscall_allowlist_syslog));
}
/* libvhost-user calls this for post-copy migration, we don't need it */