virtiofsd: Pull 2020-05-01 (includes CVE fix)

This set includes a security fix, other fixes and improvements.
 
 Security fix:
 The security fix is for CVE-2020-10717 where, on low RAM hosts,
 the guest can potentially exceed the maximum fd limit.
 This fix adds some more configuration so that the user
 can explicitly set the limit.
 
 Fixes:
 
 Recursive mounting of the exported directory is now used in
 the sandbox, such that if there was a mount underneath present at
 the time the virtiofsd was started, that mount is also
 visible to the guest; in the existing code, only mounts that
 happened after startup were visible.
 
 Security improvements:
 
 The jailing for /proc/self/fd is improved - but it's something
 that shouldn't be accessible anyway.
 
 Most capabilities are now dropped at startup; again this shouldn't
 change any behaviour but is extra protection.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEERfXHG0oMt/uXep+pBRYzHrxb/ecFAl6sc0YACgkQBRYzHrxb
 /eeP2Q/+KK//vrNblyKsAFV3G9fPi5VTLz6j/fwiA1JduyX3X8w7kGGwfxSuwvJP
 K7t07W/bkyVfFe+8kMVOkFFU63XPMIR0s6NGrnmLMJOY8eLXNtqFodE6yBzOPplA
 +ugxKRirHegx3mEtEvWO0EJU0EWcJ8QuaF0NT35ef3I/ufmyggapZ9cdYfPCgow/
 QHMQjvEl2CwC7kYwm4s0bkS/kP67m2SorVx/fr6Q0kSLUssqYhmbkK3KLdv+eUIo
 K1AFpoPBcpxdh6TxhtMM4tJ/2OycZOvmdKzEdzciq1MRTH1AgVU7ZBQPv/76bsJ8
 yOsaNUqz0K2aNERF+0uBJ6POMcbfqg9IMwpgbM+vv2Yp7UjjUpdZNLplQIil1QvP
 9Eun2KNJ9sF3ZM2DcusQuxOcs4cVPVui1k1+KqFRBV1oxoj5ikosrmkI3OOjjdZd
 lLJRmQQZ451zMb/aKndu+jG08ciWh+2l3NGZ2e7Sllo+X5lE5yhToXlRgkK5kR+U
 1aS+CDj/kWQ/PIP9KYgss6nxHmsPb7P7KnR42MnrU8GSroayzqdLn2hQYx9CZQAB
 /2QpvZGAX95urKTz9tpCRXzq2JMAY+qe2/YYgiNPhABWqIsu63YYuYZhahg+UFeq
 d29aUGxUOyLsuQWlyB3aFoOIIjNp89n3V5cgXIWHM2vbMVq8P4E=
 =tou1
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20200501' into staging

virtiofsd: Pull 2020-05-01 (includes CVE fix)

This set includes a security fix, other fixes and improvements.

Security fix:
The security fix is for CVE-2020-10717 where, on low RAM hosts,
the guest can potentially exceed the maximum fd limit.
This fix adds some more configuration so that the user
can explicitly set the limit.

Fixes:

Recursive mounting of the exported directory is now used in
the sandbox, such that if there was a mount underneath present at
the time the virtiofsd was started, that mount is also
visible to the guest; in the existing code, only mounts that
happened after startup were visible.

Security improvements:

The jailing for /proc/self/fd is improved - but it's something
that shouldn't be accessible anyway.

Most capabilities are now dropped at startup; again this shouldn't
change any behaviour but is extra protection.

# gpg: Signature made Fri 01 May 2020 20:06:46 BST
# gpg:                using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full]
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A  9FA9 0516 331E BC5B FDE7

* remotes/dgilbert-gitlab/tags/pull-virtiofs-20200501:
  virtiofsd: drop all capabilities in the wait parent process
  virtiofsd: only retain file system capabilities
  virtiofsd: Show submounts
  virtiofsd: jail lo->proc_self_fd
  virtiofsd: stay below fs.file-max sysctl value (CVE-2020-10717)
  virtiofsd: add --rlimit-nofile=NUM option

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2020-05-01 23:10:22 +01:00
commit 6897541d90
3 changed files with 135 additions and 19 deletions

View File

@ -1777,6 +1777,7 @@ struct fuse_cmdline_opts {
int syslog;
int log_level;
unsigned int max_idle_threads;
unsigned long rlimit_nofile;
};
/**

View File

@ -23,6 +23,8 @@
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#define FUSE_HELPER_OPT(t, p) \
@ -53,6 +55,7 @@ static const struct fuse_opt fuse_helper_opts[] = {
FUSE_HELPER_OPT("subtype=", nodefault_subtype),
FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP),
FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads),
FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile),
FUSE_HELPER_OPT("--syslog", syslog),
FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG),
FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO),
@ -171,6 +174,10 @@ void fuse_cmdline_help(void)
" default: no_writeback\n"
" -o xattr|no_xattr enable/disable xattr\n"
" default: no_xattr\n"
" --rlimit-nofile=<num> set maximum number of file descriptors\n"
" (0 leaves rlimit unchanged)\n"
" default: min(1000000, fs.file-max - 16384)\n"
" if the current rlimit is lower\n"
);
}
@ -191,11 +198,51 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key,
}
}
static unsigned long get_default_rlimit_nofile(void)
{
g_autofree gchar *file_max_str = NULL;
const rlim_t reserved_fds = 16384; /* leave at least this many fds free */
rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */
rlim_t file_max;
struct rlimit rlim;
/*
* Reduce max_fds below the system-wide maximum, if necessary. This
* ensures there are fds available for other processes so we don't
* cause resource exhaustion.
*/
if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str,
NULL, NULL)) {
fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n");
exit(1);
}
file_max = g_ascii_strtoull(file_max_str, NULL, 10);
if (file_max < 2 * reserved_fds) {
fuse_log(FUSE_LOG_ERR,
"The fs.file-max sysctl is too low (%lu) to allow a "
"reasonable number of open files.\n",
(unsigned long)file_max);
exit(1);
}
max_fds = MIN(file_max - reserved_fds, max_fds);
if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
exit(1);
}
if (rlim.rlim_cur >= max_fds) {
return 0; /* we have more fds available than required! */
}
return max_fds;
}
int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts)
{
memset(opts, 0, sizeof(struct fuse_cmdline_opts));
opts->max_idle_threads = 10;
opts->rlimit_nofile = get_default_rlimit_nofile();
opts->foreground = 1;
if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) ==

View File

@ -2530,12 +2530,25 @@ static void print_capabilities(void)
printf("}\n");
}
/*
* Drop all Linux capabilities because the wait parent process only needs to
* sit in waitpid(2) and terminate.
*/
static void setup_wait_parent_capabilities(void)
{
capng_setpid(syscall(SYS_gettid));
capng_clear(CAPNG_SELECT_BOTH);
capng_apply(CAPNG_SELECT_BOTH);
}
/*
* Move to a new mount, net, and pid namespaces to isolate this process.
*/
static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
{
pid_t child;
char template[] = "virtiofsd-XXXXXX";
char *tmpdir;
/*
* Create a new pid namespace for *child* processes. We'll have to
@ -2561,6 +2574,8 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
pid_t waited;
int wstatus;
setup_wait_parent_capabilities();
/* The parent waits for the child */
do {
waited = waitpid(child, &wstatus, 0);
@ -2597,12 +2612,33 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
exit(1);
}
/* Now we can get our /proc/self/fd directory file descriptor */
lo->proc_self_fd = open("/proc/self/fd", O_PATH);
if (lo->proc_self_fd == -1) {
fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
tmpdir = mkdtemp(template);
if (!tmpdir) {
fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template);
exit(1);
}
if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n",
tmpdir);
exit(1);
}
/* Now we can get our /proc/self/fd directory file descriptor */
lo->proc_self_fd = open(tmpdir, O_PATH);
if (lo->proc_self_fd == -1) {
fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir);
exit(1);
}
if (umount2(tmpdir, MNT_DETACH) < 0) {
fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir);
exit(1);
}
if (rmdir(tmpdir) < 0) {
fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir);
}
}
/*
@ -2643,7 +2679,7 @@ static void setup_mounts(const char *source)
int oldroot;
int newroot;
if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
exit(1);
}
@ -2695,6 +2731,43 @@ static void setup_mounts(const char *source)
close(oldroot);
}
/*
* Only keep whitelisted capabilities that are needed for file system operation
*/
static void setup_capabilities(void)
{
pthread_mutex_lock(&cap.mutex);
capng_restore_state(&cap.saved);
/*
* Whitelist file system-related capabilities that are needed for a file
* server to act like root. Drop everything else like networking and
* sysadmin capabilities.
*
* Exclusions:
* 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
* and we don't support that.
* 2. CAP_MAC_OVERRIDE is not included because it only seems to be
* used by the Smack LSM. Omit it until there is demand for it.
*/
capng_setpid(syscall(SYS_gettid));
capng_clear(CAPNG_SELECT_BOTH);
capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
CAP_CHOWN,
CAP_DAC_OVERRIDE,
CAP_DAC_READ_SEARCH,
CAP_FOWNER,
CAP_FSETID,
CAP_SETGID,
CAP_SETUID,
CAP_MKNOD,
CAP_SETFCAP);
capng_apply(CAPNG_SELECT_BOTH);
cap.saved = capng_save_state();
pthread_mutex_unlock(&cap.mutex);
}
/*
* Lock down this process to prevent access to other processes or files outside
* source directory. This reduces the impact of arbitrary code execution bugs.
@ -2705,26 +2778,21 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
setup_namespaces(lo, se);
setup_mounts(lo->source);
setup_seccomp(enable_syslog);
setup_capabilities();
}
/* Raise the maximum number of open file descriptors */
static void setup_nofile_rlimit(void)
/* Set the maximum number of open file descriptors */
static void setup_nofile_rlimit(unsigned long rlimit_nofile)
{
const rlim_t max_fds = 1000000;
struct rlimit rlim;
struct rlimit rlim = {
.rlim_cur = rlimit_nofile,
.rlim_max = rlimit_nofile,
};
if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
exit(1);
}
if (rlim.rlim_cur >= max_fds) {
if (rlimit_nofile == 0) {
return; /* nothing to do */
}
rlim.rlim_cur = max_fds;
rlim.rlim_max = max_fds;
if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
/* Ignore SELinux denials */
if (errno == EPERM) {
@ -2977,7 +3045,7 @@ int main(int argc, char *argv[])
fuse_daemonize(opts.foreground);
setup_nofile_rlimit();
setup_nofile_rlimit(opts.rlimit_nofile);
/* Must be before sandbox since it wants /proc */
setup_capng();