diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 2ffd96ade9..7bb994bbf2 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -314,8 +314,8 @@ static V9fsFidState *alloc_fid(V9fsState *s, int32_t fid) f->next = s->fid_list; s->fid_list = f; - v9fs_readdir_init(&f->fs.dir); - v9fs_readdir_init(&f->fs_reclaim.dir); + v9fs_readdir_init(s->proto_version, &f->fs.dir); + v9fs_readdir_init(s->proto_version, &f->fs_reclaim.dir); return f; } @@ -972,30 +972,6 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp, return 0; } -static int coroutine_fn dirent_to_qid(V9fsPDU *pdu, V9fsFidState *fidp, - struct dirent *dent, V9fsQID *qidp) -{ - struct stat stbuf; - V9fsPath path; - int err; - - v9fs_path_init(&path); - - err = v9fs_co_name_to_path(pdu, &fidp->path, dent->d_name, &path); - if (err < 0) { - goto out; - } - err = v9fs_co_lstat(pdu, &path, &stbuf); - if (err < 0) { - goto out; - } - err = stat_to_qid(pdu, &stbuf, qidp); - -out: - v9fs_path_free(&path); - return err; -} - V9fsPDU *pdu_alloc(V9fsState *s) { V9fsPDU *pdu = NULL; @@ -2252,7 +2228,14 @@ static void coroutine_fn v9fs_read(void *opaque) goto out_nofid; } if (fidp->fid_type == P9_FID_DIR) { - + if (s->proto_version != V9FS_PROTO_2000U) { + warn_report_once( + "9p: bad client: T_read request on directory only expected " + "with 9P2000.u protocol version" + ); + err = -EOPNOTSUPP; + goto out; + } if (off == 0) { v9fs_co_rewinddir(pdu, fidp); } @@ -2313,7 +2296,13 @@ out_nofid: pdu_complete(pdu, err); } -static size_t v9fs_readdir_data_size(V9fsString *name) +/** + * Returns size required in Rreaddir response for the passed dirent @p name. + * + * @param name - directory entry's name (i.e. file name, directory name) + * @returns required size in bytes + */ +size_t v9fs_readdir_response_size(V9fsString *name) { /* * Size of each dirent on the wire: size of qid (13) + size of offset (8) @@ -2322,62 +2311,74 @@ static size_t v9fs_readdir_data_size(V9fsString *name) return 24 + v9fs_string_size(name); } +static void v9fs_free_dirents(struct V9fsDirEnt *e) +{ + struct V9fsDirEnt *next = NULL; + + for (; e; e = next) { + next = e->next; + g_free(e->dent); + g_free(e->st); + g_free(e); + } +} + static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, - int32_t max_count) + off_t offset, int32_t max_count) { size_t size; V9fsQID qid; V9fsString name; int len, err = 0; int32_t count = 0; - off_t saved_dir_pos; struct dirent *dent; + struct stat *st; + struct V9fsDirEnt *entries = NULL; - /* save the directory position */ - saved_dir_pos = v9fs_co_telldir(pdu, fidp); - if (saved_dir_pos < 0) { - return saved_dir_pos; + /* + * inode remapping requires the device id, which in turn might be + * different for different directory entries, so if inode remapping is + * enabled we have to make a full stat for each directory entry + */ + const bool dostat = pdu->s->ctx.export_flags & V9FS_REMAP_INODES; + + /* + * Fetch all required directory entries altogether on a background IO + * thread from fs driver. We don't want to do that for each entry + * individually, because hopping between threads (this main IO thread + * and background IO driver thread) would sum up to huge latencies. + */ + count = v9fs_co_readdir_many(pdu, fidp, &entries, offset, max_count, + dostat); + if (count < 0) { + err = count; + count = 0; + goto out; } + count = 0; - while (1) { - v9fs_readdir_lock(&fidp->fs.dir); - - err = v9fs_co_readdir(pdu, fidp, &dent); - if (err || !dent) { - break; - } - v9fs_string_init(&name); - v9fs_string_sprintf(&name, "%s", dent->d_name); - if ((count + v9fs_readdir_data_size(&name)) > max_count) { - v9fs_readdir_unlock(&fidp->fs.dir); - - /* Ran out of buffer. Set dir back to old position and return */ - v9fs_co_seekdir(pdu, fidp, saved_dir_pos); - v9fs_string_free(&name); - return count; - } + for (struct V9fsDirEnt *e = entries; e; e = e->next) { + dent = e->dent; if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) { - /* - * dirent_to_qid() implies expensive stat call for each entry, - * we must do that here though since inode remapping requires - * the device id, which in turn might be different for - * different entries; we cannot make any assumption to avoid - * that here. - */ - err = dirent_to_qid(pdu, fidp, dent, &qid); + st = e->st; + /* e->st should never be NULL, but just to be sure */ + if (!st) { + err = -1; + break; + } + + /* remap inode */ + err = stat_to_qid(pdu, st, &qid); if (err < 0) { - v9fs_readdir_unlock(&fidp->fs.dir); - v9fs_co_seekdir(pdu, fidp, saved_dir_pos); - v9fs_string_free(&name); - return err; + break; } } else { /* * Fill up just the path field of qid because the client uses * only that. To fill the entire qid structure we will have * to stat each dirent found, which is expensive. For the - * latter reason we don't call dirent_to_qid() here. Only drawback + * latter reason we don't call stat_to_qid() here. Only drawback * is that no multi-device export detection of stat_to_qid() * would be done and provided as error to the user here. But * user would get that error anyway when accessing those @@ -2390,25 +2391,26 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, qid.version = 0; } + v9fs_string_init(&name); + v9fs_string_sprintf(&name, "%s", dent->d_name); + /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */ len = pdu_marshal(pdu, 11 + count, "Qqbs", &qid, dent->d_off, dent->d_type, &name); - v9fs_readdir_unlock(&fidp->fs.dir); + v9fs_string_free(&name); if (len < 0) { - v9fs_co_seekdir(pdu, fidp, saved_dir_pos); - v9fs_string_free(&name); - return len; + err = len; + break; } + count += len; - v9fs_string_free(&name); - saved_dir_pos = dent->d_off; } - v9fs_readdir_unlock(&fidp->fs.dir); - +out: + v9fs_free_dirents(entries); if (err < 0) { return err; } @@ -2451,12 +2453,15 @@ static void coroutine_fn v9fs_readdir(void *opaque) retval = -EINVAL; goto out; } - if (initial_offset == 0) { - v9fs_co_rewinddir(pdu, fidp); - } else { - v9fs_co_seekdir(pdu, fidp, initial_offset); + if (s->proto_version != V9FS_PROTO_2000L) { + warn_report_once( + "9p: bad client: T_readdir request only expected with 9P2000.L " + "protocol version" + ); + retval = -EOPNOTSUPP; + goto out; } - count = v9fs_do_readdir(pdu, fidp, max_count); + count = v9fs_do_readdir(pdu, fidp, (off_t) initial_offset, max_count); if (count < 0) { retval = count; goto out; diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h index ee2271663c..3dd1b50b1a 100644 --- a/hw/9pfs/9p.h +++ b/hw/9pfs/9p.h @@ -197,24 +197,63 @@ typedef struct V9fsXattr typedef struct V9fsDir { DIR *stream; - CoMutex readdir_mutex; + P9ProtoVersion proto_version; + /* readdir mutex type used for 9P2000.u protocol variant */ + CoMutex readdir_mutex_u; + /* readdir mutex type used for 9P2000.L protocol variant */ + QemuMutex readdir_mutex_L; } V9fsDir; static inline void v9fs_readdir_lock(V9fsDir *dir) { - qemu_co_mutex_lock(&dir->readdir_mutex); + if (dir->proto_version == V9FS_PROTO_2000U) { + qemu_co_mutex_lock(&dir->readdir_mutex_u); + } else { + qemu_mutex_lock(&dir->readdir_mutex_L); + } } static inline void v9fs_readdir_unlock(V9fsDir *dir) { - qemu_co_mutex_unlock(&dir->readdir_mutex); + if (dir->proto_version == V9FS_PROTO_2000U) { + qemu_co_mutex_unlock(&dir->readdir_mutex_u); + } else { + qemu_mutex_unlock(&dir->readdir_mutex_L); + } } -static inline void v9fs_readdir_init(V9fsDir *dir) +static inline void v9fs_readdir_init(P9ProtoVersion proto_version, V9fsDir *dir) { - qemu_co_mutex_init(&dir->readdir_mutex); + dir->proto_version = proto_version; + if (proto_version == V9FS_PROTO_2000U) { + qemu_co_mutex_init(&dir->readdir_mutex_u); + } else { + qemu_mutex_init(&dir->readdir_mutex_L); + } } +/** + * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests, + * which is a chained list of directory entries. + */ +typedef struct V9fsDirEnt { + /* mandatory (must not be NULL) information for all readdir requests */ + struct dirent *dent; + /* + * optional (may be NULL): A full stat of each directory entry is just + * done if explicitly told to fs driver. + */ + struct stat *st; + /* + * instead of an array, directory entries are always returned as + * chained list, that's because the amount of entries retrieved by fs + * drivers is dependent on the individual entries' name (since response + * messages are size limited), so the final amount cannot be estimated + * before hand + */ + struct V9fsDirEnt *next; +} V9fsDirEnt; + /* * Filled by fs driver on open and other * calls. @@ -419,6 +458,7 @@ void v9fs_path_init(V9fsPath *path); void v9fs_path_free(V9fsPath *path); void v9fs_path_sprintf(V9fsPath *path, const char *fmt, ...); void v9fs_path_copy(V9fsPath *dst, const V9fsPath *src); +size_t v9fs_readdir_response_size(V9fsString *name); int v9fs_name_to_path(V9fsState *s, V9fsPath *dirpath, const char *name, V9fsPath *path); int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t, diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c index 73f9a751e1..1f70a58df5 100644 --- a/hw/9pfs/codir.c +++ b/hw/9pfs/codir.c @@ -18,28 +18,209 @@ #include "qemu/main-loop.h" #include "coth.h" +/* + * Intended to be called from bottom-half (e.g. background I/O thread) + * context. + */ +static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent) +{ + int err = 0; + V9fsState *s = pdu->s; + struct dirent *entry; + + errno = 0; + entry = s->ops->readdir(&s->ctx, &fidp->fs); + if (!entry && errno) { + *dent = NULL; + err = -errno; + } else { + *dent = entry; + } + return err; +} + +/* + * TODO: This will be removed for performance reasons. + * Use v9fs_co_readdir_many() instead. + */ int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent) { int err; - V9fsState *s = pdu->s; if (v9fs_request_cancelled(pdu)) { return -EINTR; } - v9fs_co_run_in_worker( - { - struct dirent *entry; + v9fs_co_run_in_worker({ + err = do_readdir(pdu, fidp, dent); + }); + return err; +} - errno = 0; - entry = s->ops->readdir(&s->ctx, &fidp->fs); - if (!entry && errno) { +/* + * This is solely executed on a background IO thread. + * + * See v9fs_co_readdir_many() (as its only user) below for details. + */ +static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, + struct V9fsDirEnt **entries, off_t offset, + int32_t maxsize, bool dostat) +{ + V9fsState *s = pdu->s; + V9fsString name; + int len, err = 0; + int32_t size = 0; + off_t saved_dir_pos; + struct dirent *dent; + struct V9fsDirEnt *e = NULL; + V9fsPath path; + struct stat stbuf; + + *entries = NULL; + v9fs_path_init(&path); + + /* + * TODO: Here should be a warn_report_once() if lock failed. + * + * With a good 9p client we should not get into concurrency here, + * because a good client would not use the same fid for concurrent + * requests. We do the lock here for safety reasons though. However + * the client would then suffer performance issues, so better log that + * issue here. + */ + v9fs_readdir_lock(&fidp->fs.dir); + + /* seek directory to requested initial position */ + if (offset == 0) { + s->ops->rewinddir(&s->ctx, &fidp->fs); + } else { + s->ops->seekdir(&s->ctx, &fidp->fs, offset); + } + + /* save the directory position */ + saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs); + if (saved_dir_pos < 0) { + err = saved_dir_pos; + goto out; + } + + while (true) { + /* interrupt loop if request was cancelled by a Tflush request */ + if (v9fs_request_cancelled(pdu)) { + err = -EINTR; + break; + } + + /* get directory entry from fs driver */ + err = do_readdir(pdu, fidp, &dent); + if (err || !dent) { + break; + } + + /* + * stop this loop as soon as it would exceed the allowed maximum + * response message size for the directory entries collected so far, + * because anything beyond that size would need to be discarded by + * 9p controller (main thread / top half) anyway + */ + v9fs_string_init(&name); + v9fs_string_sprintf(&name, "%s", dent->d_name); + len = v9fs_readdir_response_size(&name); + v9fs_string_free(&name); + if (size + len > maxsize) { + /* this is not an error case actually */ + break; + } + + /* append next node to result chain */ + if (!e) { + *entries = e = g_malloc0(sizeof(V9fsDirEnt)); + } else { + e = e->next = g_malloc0(sizeof(V9fsDirEnt)); + } + e->dent = g_malloc0(sizeof(struct dirent)); + memcpy(e->dent, dent, sizeof(struct dirent)); + + /* perform a full stat() for directory entry if requested by caller */ + if (dostat) { + err = s->ops->name_to_path( + &s->ctx, &fidp->path, dent->d_name, &path + ); + if (err < 0) { err = -errno; - } else { - *dent = entry; - err = 0; + break; } - }); + + err = s->ops->lstat(&s->ctx, &path, &stbuf); + if (err < 0) { + err = -errno; + break; + } + + e->st = g_malloc0(sizeof(struct stat)); + memcpy(e->st, &stbuf, sizeof(struct stat)); + } + + size += len; + saved_dir_pos = dent->d_off; + } + + /* restore (last) saved position */ + s->ops->seekdir(&s->ctx, &fidp->fs, saved_dir_pos); + +out: + v9fs_readdir_unlock(&fidp->fs.dir); + v9fs_path_free(&path); + if (err < 0) { + return err; + } + return size; +} + +/** + * @brief Reads multiple directory entries in one rush. + * + * Retrieves the requested (max. amount of) directory entries from the fs + * driver. This function must only be called by the main IO thread (top half). + * Internally this function call will be dispatched to a background IO thread + * (bottom half) where it is eventually executed by the fs driver. + * + * @discussion Acquiring multiple directory entries in one rush from the fs + * driver, instead of retrieving each directory entry individually, is very + * beneficial from performance point of view. Because for every fs driver + * request latency is added, which in practice could lead to overall + * latencies of several hundred ms for reading all entries (of just a single + * directory) if every directory entry was individually requested from fs + * driver. + * + * @note You must @b ALWAYS call @c v9fs_free_dirents(entries) after calling + * v9fs_co_readdir_many(), both on success and on error cases of this + * function, to avoid memory leaks once @p entries are no longer needed. + * + * @param pdu - the causing 9p (T_readdir) client request + * @param fidp - already opened directory where readdir shall be performed on + * @param entries - output for directory entries (must not be NULL) + * @param offset - initial position inside the directory the function shall + * seek to before retrieving the directory entries + * @param maxsize - maximum result message body size (in bytes) + * @param dostat - whether a stat() should be performed and returned for + * each directory entry + * @returns resulting response message body size (in bytes) on success, + * negative error code otherwise + */ +int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, + struct V9fsDirEnt **entries, + off_t offset, int32_t maxsize, + bool dostat) +{ + int err = 0; + + if (v9fs_request_cancelled(pdu)) { + return -EINTR; + } + v9fs_co_run_in_worker({ + err = do_readdir_many(pdu, fidp, entries, offset, maxsize, dostat); + }); return err; } diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h index c2cdc7a9ea..c51289903d 100644 --- a/hw/9pfs/coth.h +++ b/hw/9pfs/coth.h @@ -19,7 +19,7 @@ #include "qemu/coroutine.h" #include "9p.h" -/* +/** * we want to use bottom half because we want to make sure the below * sequence of events. * @@ -28,6 +28,16 @@ * 3. Enter the coroutine in the worker thread. * we cannot swap step 1 and 2, because that would imply worker thread * can enter coroutine while step1 is still running + * + * @b PERFORMANCE @b CONSIDERATIONS: As a rule of thumb, keep in mind + * that hopping between threads adds @b latency! So when handling a + * 9pfs request, avoid calling v9fs_co_run_in_worker() too often, because + * this might otherwise sum up to a significant, huge overall latency for + * providing the response for just a single request. For that reason it + * is highly recommended to fetch all data from fs driver with a single + * fs driver request on a background I/O thread (bottom half) in one rush + * first and then eventually assembling the final response from that data + * on main I/O thread (top half). */ #define v9fs_co_run_in_worker(code_block) \ do { \ @@ -49,6 +59,9 @@ void co_run_in_worker_bh(void *); int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *); int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **); +int coroutine_fn v9fs_co_readdir_many(V9fsPDU *, V9fsFidState *, + struct V9fsDirEnt **, off_t, int32_t, + bool); off_t coroutine_fn v9fs_co_telldir(V9fsPDU *, V9fsFidState *); void coroutine_fn v9fs_co_seekdir(V9fsPDU *, V9fsFidState *, off_t); void coroutine_fn v9fs_co_rewinddir(V9fsPDU *, V9fsFidState *); diff --git a/tests/qtest/virtio-9p-test.c b/tests/qtest/virtio-9p-test.c index 2167322985..de30b717b6 100644 --- a/tests/qtest/virtio-9p-test.c +++ b/tests/qtest/virtio-9p-test.c @@ -578,6 +578,7 @@ static bool fs_dirents_contain_name(struct V9fsDirent *e, const char* name) return false; } +/* basic readdir test where reply fits into a single response message */ static void fs_readdir(void *obj, void *data, QGuestAllocator *t_alloc) { QVirtio9P *v9p = obj; @@ -631,6 +632,89 @@ static void fs_readdir(void *obj, void *data, QGuestAllocator *t_alloc) g_free(wnames[0]); } +/* readdir test where overall request is split over several messages */ +static void fs_readdir_split(void *obj, void *data, QGuestAllocator *t_alloc, + uint32_t count) +{ + QVirtio9P *v9p = obj; + alloc = t_alloc; + char *const wnames[] = { g_strdup(QTEST_V9FS_SYNTH_READDIR_DIR) }; + uint16_t nqid; + v9fs_qid qid; + uint32_t nentries, npartialentries; + struct V9fsDirent *entries, *tail, *partialentries; + P9Req *req; + int fid; + uint64_t offset; + + fs_attach(v9p, NULL, t_alloc); + + fid = 1; + offset = 0; + entries = NULL; + nentries = 0; + tail = NULL; + + req = v9fs_twalk(v9p, 0, fid, 1, wnames, 0); + v9fs_req_wait_for_reply(req, NULL); + v9fs_rwalk(req, &nqid, NULL); + g_assert_cmpint(nqid, ==, 1); + + req = v9fs_tlopen(v9p, fid, O_DIRECTORY, 0); + v9fs_req_wait_for_reply(req, NULL); + v9fs_rlopen(req, &qid, NULL); + + /* + * send as many Treaddir requests as required to get all directory + * entries + */ + while (true) { + npartialentries = 0; + partialentries = NULL; + + req = v9fs_treaddir(v9p, fid, offset, count, 0); + v9fs_req_wait_for_reply(req, NULL); + v9fs_rreaddir(req, &count, &npartialentries, &partialentries); + if (npartialentries > 0 && partialentries) { + if (!entries) { + entries = partialentries; + nentries = npartialentries; + tail = partialentries; + } else { + tail->next = partialentries; + nentries += npartialentries; + } + while (tail->next) { + tail = tail->next; + } + offset = tail->offset; + } else { + break; + } + } + + g_assert_cmpint( + nentries, ==, + QTEST_V9FS_SYNTH_READDIR_NFILES + 2 /* "." and ".." */ + ); + + /* + * Check all file names exist in returned entries, ignore their order + * though. + */ + g_assert_cmpint(fs_dirents_contain_name(entries, "."), ==, true); + g_assert_cmpint(fs_dirents_contain_name(entries, ".."), ==, true); + for (int i = 0; i < QTEST_V9FS_SYNTH_READDIR_NFILES; ++i) { + char *name = g_strdup_printf(QTEST_V9FS_SYNTH_READDIR_FILE, i); + g_assert_cmpint(fs_dirents_contain_name(entries, name), ==, true); + g_free(name); + } + + v9fs_free_dirents(entries); + + g_free(wnames[0]); +} + static void fs_walk_no_slash(void *obj, void *data, QGuestAllocator *t_alloc) { QVirtio9P *v9p = obj; @@ -793,6 +877,24 @@ static void fs_flush_ignored(void *obj, void *data, QGuestAllocator *t_alloc) g_free(wnames[0]); } +static void fs_readdir_split_128(void *obj, void *data, + QGuestAllocator *t_alloc) +{ + fs_readdir_split(obj, data, t_alloc, 128); +} + +static void fs_readdir_split_256(void *obj, void *data, + QGuestAllocator *t_alloc) +{ + fs_readdir_split(obj, data, t_alloc, 256); +} + +static void fs_readdir_split_512(void *obj, void *data, + QGuestAllocator *t_alloc) +{ + fs_readdir_split(obj, data, t_alloc, 512); +} + static void register_virtio_9p_test(void) { qos_add_test("config", "virtio-9p", pci_config, NULL); @@ -810,6 +912,12 @@ static void register_virtio_9p_test(void) qos_add_test("fs/flush/ignored", "virtio-9p", fs_flush_ignored, NULL); qos_add_test("fs/readdir/basic", "virtio-9p", fs_readdir, NULL); + qos_add_test("fs/readdir/split_512", "virtio-9p", + fs_readdir_split_512, NULL); + qos_add_test("fs/readdir/split_256", "virtio-9p", + fs_readdir_split_256, NULL); + qos_add_test("fs/readdir/split_128", "virtio-9p", + fs_readdir_split_128, NULL); } libqos_init(register_virtio_9p_test);