xref: /openbmc/qemu/hw/9pfs/9p.c (revision 20ced60dd2a577d5e9bf0a16ff3ef0f8a953f495)
1 /*
2  * Virtio 9p backend
3  *
4  * Copyright IBM, Corp. 2010
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  */
13 
14 /*
15  * Not so fast! You might want to read the 9p developer docs first:
16  * https://wiki.qemu.org/Documentation/9p
17  */
18 
19 #include "qemu/osdep.h"
20 #ifdef CONFIG_LINUX
21 #include <linux/limits.h>
22 #endif
23 #include <glib/gprintf.h>
24 #include "hw/virtio/virtio.h"
25 #include "qapi/error.h"
26 #include "qemu/error-report.h"
27 #include "qemu/iov.h"
28 #include "qemu/main-loop.h"
29 #include "qemu/sockets.h"
30 #include "virtio-9p.h"
31 #include "fsdev/qemu-fsdev.h"
32 #include "9p-xattr.h"
33 #include "9p-util.h"
34 #include "coth.h"
35 #include "trace.h"
36 #include "migration/blocker.h"
37 #include "qemu/xxhash.h"
38 #include <math.h>
39 
/*
 * Not referenced in this part of the file.
 * NOTE(review): presumably a high-water mark for open fds - confirm at the
 * place where it is assigned.
 */
int open_fd_hw;
/*
 * Number of currently open host fds as tracked by this backend;
 * v9fs_reclaim_fd() subtracts the number of descriptors it closed.
 */
int total_open_fd;
/* Upper bound on how many fids v9fs_reclaim_fd() picks in a single run. */
static int open_fd_rc;
43 
/*
 * Open mode values as consumed by omode_to_uflags(): the two low bits select
 * the access mode (Oread/Owrite/Ordwr/Oexec), the remaining values are
 * independent flag bits. Orexec and Orclose are not evaluated by
 * omode_to_uflags().
 */
enum {
    Oread   = 0x00,
    Owrite  = 0x01,
    Ordwr   = 0x02,
    Oexec   = 0x03,
    Oexcl   = 0x04,
    Otrunc  = 0x10,
    Orexec  = 0x20,
    Orclose = 0x40,
    Oappend = 0x80,
};
55 
/*
 * Instantiate the P9ARRAY helper for V9fsPath with v9fs_path_free().
 * NOTE(review): v9fs_path_free is presumably the per-element cleanup
 * function - confirm against the macro definition.
 */
P9ARRAY_DEFINE_TYPE(V9fsPath, v9fs_path_free);
57 
58 static ssize_t pdu_marshal(V9fsPDU *pdu, size_t offset, const char *fmt, ...)
59 {
60     ssize_t ret;
61     va_list ap;
62 
63     va_start(ap, fmt);
64     ret = pdu->s->transport->pdu_vmarshal(pdu, offset, fmt, ap);
65     va_end(ap);
66 
67     return ret;
68 }
69 
70 static ssize_t pdu_unmarshal(V9fsPDU *pdu, size_t offset, const char *fmt, ...)
71 {
72     ssize_t ret;
73     va_list ap;
74 
75     va_start(ap, fmt);
76     ret = pdu->s->transport->pdu_vunmarshal(pdu, offset, fmt, ap);
77     va_end(ap);
78 
79     return ret;
80 }
81 
82 static int omode_to_uflags(int8_t mode)
83 {
84     int ret = 0;
85 
86     switch (mode & 3) {
87     case Oread:
88         ret = O_RDONLY;
89         break;
90     case Ordwr:
91         ret = O_RDWR;
92         break;
93     case Owrite:
94         ret = O_WRONLY;
95         break;
96     case Oexec:
97         ret = O_RDONLY;
98         break;
99     }
100 
101     if (mode & Otrunc) {
102         ret |= O_TRUNC;
103     }
104 
105     if (mode & Oappend) {
106         ret |= O_APPEND;
107     }
108 
109     if (mode & Oexcl) {
110         ret |= O_EXCL;
111     }
112 
113     return ret;
114 }
115 
/* One row of the translation table used by dotl_to_open_flags(). */
typedef struct DotlOpenflagMap {
    int dotl_flag; /* P9_DOTL_* bit tested in the incoming flags */
    int open_flag; /* host open flag OR-ed in when dotl_flag is set */
} DotlOpenflagMap;
120 
121 static int dotl_to_open_flags(int flags)
122 {
123     int i;
124     /*
125      * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY
126      * and P9_DOTL_NOACCESS
127      */
128     int oflags = flags & O_ACCMODE;
129 
130     DotlOpenflagMap dotl_oflag_map[] = {
131         { P9_DOTL_CREATE, O_CREAT },
132         { P9_DOTL_EXCL, O_EXCL },
133         { P9_DOTL_NOCTTY , O_NOCTTY },
134         { P9_DOTL_TRUNC, O_TRUNC },
135         { P9_DOTL_APPEND, O_APPEND },
136         { P9_DOTL_NONBLOCK, O_NONBLOCK } ,
137         { P9_DOTL_DSYNC, O_DSYNC },
138         { P9_DOTL_FASYNC, FASYNC },
139 #ifndef CONFIG_DARWIN
140         { P9_DOTL_NOATIME, O_NOATIME },
141         /*
142          *  On Darwin, we could map to F_NOCACHE, which is
143          *  similar, but doesn't quite have the same
144          *  semantics. However, we don't support O_DIRECT
145          *  even on linux at the moment, so we just ignore
146          *  it here.
147          */
148         { P9_DOTL_DIRECT, O_DIRECT },
149 #endif
150         { P9_DOTL_LARGEFILE, O_LARGEFILE },
151         { P9_DOTL_DIRECTORY, O_DIRECTORY },
152         { P9_DOTL_NOFOLLOW, O_NOFOLLOW },
153         { P9_DOTL_SYNC, O_SYNC },
154     };
155 
156     for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) {
157         if (flags & dotl_oflag_map[i].dotl_flag) {
158             oflags |= dotl_oflag_map[i].open_flag;
159         }
160     }
161 
162     return oflags;
163 }
164 
165 void cred_init(FsCred *credp)
166 {
167     credp->fc_uid = -1;
168     credp->fc_gid = -1;
169     credp->fc_mode = -1;
170     credp->fc_rdev = -1;
171 }
172 
173 static int get_dotl_openflags(V9fsState *s, int oflags)
174 {
175     int flags;
176     /*
177      * Filter the client open flags
178      */
179     flags = dotl_to_open_flags(oflags);
180     flags &= ~(O_NOCTTY | O_ASYNC | O_CREAT);
181 #ifndef CONFIG_DARWIN
182     /*
183      * Ignore direct disk access hint until the server supports it.
184      */
185     flags &= ~O_DIRECT;
186 #endif
187     return flags;
188 }
189 
190 void v9fs_path_init(V9fsPath *path)
191 {
192     path->data = NULL;
193     path->size = 0;
194 }
195 
196 void v9fs_path_free(V9fsPath *path)
197 {
198     g_free(path->data);
199     path->data = NULL;
200     path->size = 0;
201 }
202 
203 
204 void v9fs_path_sprintf(V9fsPath *path, const char *fmt, ...)
205 {
206     va_list ap;
207 
208     v9fs_path_free(path);
209 
210     va_start(ap, fmt);
211     /* Bump the size for including terminating NULL */
212     path->size = g_vasprintf(&path->data, fmt, ap) + 1;
213     va_end(ap);
214 }
215 
216 void v9fs_path_copy(V9fsPath *dst, const V9fsPath *src)
217 {
218     v9fs_path_free(dst);
219     dst->size = src->size;
220     dst->data = g_memdup(src->data, src->size);
221 }
222 
223 int v9fs_name_to_path(V9fsState *s, V9fsPath *dirpath,
224                       const char *name, V9fsPath *path)
225 {
226     int err;
227     err = s->ops->name_to_path(&s->ctx, dirpath, name, path);
228     if (err < 0) {
229         err = -errno;
230     }
231     return err;
232 }
233 
234 /*
235  * Return TRUE if s1 is an ancestor of s2.
236  *
237  * E.g. "a/b" is an ancestor of "a/b/c" but not of "a/bc/d".
238  * As a special case, We treat s1 as ancestor of s2 if they are same!
239  */
240 static int v9fs_path_is_ancestor(V9fsPath *s1, V9fsPath *s2)
241 {
242     if (!strncmp(s1->data, s2->data, s1->size - 1)) {
243         if (s2->data[s1->size - 1] == '\0' || s2->data[s1->size - 1] == '/') {
244             return 1;
245         }
246     }
247     return 0;
248 }
249 
250 static size_t v9fs_string_size(V9fsString *str)
251 {
252     return str->size;
253 }
254 
255 /*
256  * returns 0 if fid got re-opened, 1 if not, < 0 on error
257  */
258 static int coroutine_fn v9fs_reopen_fid(V9fsPDU *pdu, V9fsFidState *f)
259 {
260     int err = 1;
261     if (f->fid_type == P9_FID_FILE) {
262         if (f->fs.fd == -1) {
263             do {
264                 err = v9fs_co_open(pdu, f, f->open_flags);
265             } while (err == -EINTR && !pdu->cancelled);
266         }
267     } else if (f->fid_type == P9_FID_DIR) {
268         if (f->fs.dir.stream == NULL) {
269             do {
270                 err = v9fs_co_opendir(pdu, f);
271             } while (err == -EINTR && !pdu->cancelled);
272         }
273     }
274     return err;
275 }
276 
277 static V9fsFidState *coroutine_fn get_fid(V9fsPDU *pdu, int32_t fid)
278 {
279     int err;
280     V9fsFidState *f;
281     V9fsState *s = pdu->s;
282 
283     f = g_hash_table_lookup(s->fids, GINT_TO_POINTER(fid));
284     if (f) {
285         BUG_ON(f->clunked);
286         /*
287          * Update the fid ref upfront so that
288          * we don't get reclaimed when we yield
289          * in open later.
290          */
291         f->ref++;
292         /*
293          * check whether we need to reopen the
294          * file. We might have closed the fd
295          * while trying to free up some file
296          * descriptors.
297          */
298         err = v9fs_reopen_fid(pdu, f);
299         if (err < 0) {
300             f->ref--;
301             return NULL;
302         }
303         /*
304          * Mark the fid as referenced so that the LRU
305          * reclaim won't close the file descriptor
306          */
307         f->flags |= FID_REFERENCED;
308         return f;
309     }
310     return NULL;
311 }
312 
313 static V9fsFidState *alloc_fid(V9fsState *s, int32_t fid)
314 {
315     V9fsFidState *f;
316 
317     f = g_hash_table_lookup(s->fids, GINT_TO_POINTER(fid));
318     if (f) {
319         /* If fid is already there return NULL */
320         BUG_ON(f->clunked);
321         return NULL;
322     }
323     f = g_new0(V9fsFidState, 1);
324     f->fid = fid;
325     f->fid_type = P9_FID_NONE;
326     f->ref = 1;
327     /*
328      * Mark the fid as referenced so that the LRU
329      * reclaim won't close the file descriptor
330      */
331     f->flags |= FID_REFERENCED;
332     g_hash_table_insert(s->fids, GINT_TO_POINTER(fid), f);
333 
334     v9fs_readdir_init(s->proto_version, &f->fs.dir);
335     v9fs_readdir_init(s->proto_version, &f->fs_reclaim.dir);
336 
337     return f;
338 }
339 
340 static int coroutine_fn v9fs_xattr_fid_clunk(V9fsPDU *pdu, V9fsFidState *fidp)
341 {
342     int retval = 0;
343 
344     if (fidp->fs.xattr.xattrwalk_fid) {
345         /* getxattr/listxattr fid */
346         goto free_value;
347     }
348     /*
349      * if this is fid for setxattr. clunk should
350      * result in setxattr localcall
351      */
352     if (fidp->fs.xattr.len != fidp->fs.xattr.copied_len) {
353         /* clunk after partial write */
354         retval = -EINVAL;
355         goto free_out;
356     }
357     if (fidp->fs.xattr.len) {
358         retval = v9fs_co_lsetxattr(pdu, &fidp->path, &fidp->fs.xattr.name,
359                                    fidp->fs.xattr.value,
360                                    fidp->fs.xattr.len,
361                                    fidp->fs.xattr.flags);
362     } else {
363         retval = v9fs_co_lremovexattr(pdu, &fidp->path, &fidp->fs.xattr.name);
364     }
365 free_out:
366     v9fs_string_free(&fidp->fs.xattr.name);
367 free_value:
368     g_free(fidp->fs.xattr.value);
369     return retval;
370 }
371 
372 static int coroutine_fn free_fid(V9fsPDU *pdu, V9fsFidState *fidp)
373 {
374     int retval = 0;
375 
376     if (fidp->fid_type == P9_FID_FILE) {
377         /* If we reclaimed the fd no need to close */
378         if (fidp->fs.fd != -1) {
379             retval = v9fs_co_close(pdu, &fidp->fs);
380         }
381     } else if (fidp->fid_type == P9_FID_DIR) {
382         if (fidp->fs.dir.stream != NULL) {
383             retval = v9fs_co_closedir(pdu, &fidp->fs);
384         }
385     } else if (fidp->fid_type == P9_FID_XATTR) {
386         retval = v9fs_xattr_fid_clunk(pdu, fidp);
387     }
388     v9fs_path_free(&fidp->path);
389     g_free(fidp);
390     return retval;
391 }
392 
393 static int coroutine_fn put_fid(V9fsPDU *pdu, V9fsFidState *fidp)
394 {
395     BUG_ON(!fidp->ref);
396     fidp->ref--;
397     /*
398      * Don't free the fid if it is in reclaim list
399      */
400     if (!fidp->ref && fidp->clunked) {
401         if (fidp->fid == pdu->s->root_fid) {
402             /*
403              * if the clunked fid is root fid then we
404              * have unmounted the fs on the client side.
405              * delete the migration blocker. Ideally, this
406              * should be hooked to transport close notification
407              */
408             migrate_del_blocker(&pdu->s->migration_blocker);
409         }
410         return free_fid(pdu, fidp);
411     }
412     return 0;
413 }
414 
415 static V9fsFidState *clunk_fid(V9fsState *s, int32_t fid)
416 {
417     V9fsFidState *fidp;
418 
419     /* TODO: Use g_hash_table_steal_extended() instead? */
420     fidp = g_hash_table_lookup(s->fids, GINT_TO_POINTER(fid));
421     if (fidp) {
422         g_hash_table_remove(s->fids, GINT_TO_POINTER(fid));
423         fidp->clunked = true;
424         return fidp;
425     }
426     return NULL;
427 }
428 
/*
 * Close the host file descriptors / directory streams of fids that are
 * neither in use nor recently referenced.
 *
 * Works in three steps: pick at most open_fd_rc candidate fids from the fid
 * table (detaching their fd/stream into fs_reclaim), close all of them in
 * one worker invocation, then drop the temporary references taken on the
 * picked fids. A fid whose fd was detached here is reopened on demand by
 * v9fs_reopen_fid().
 */
void coroutine_fn v9fs_reclaim_fd(V9fsPDU *pdu)
{
    int reclaim_count = 0;
    V9fsState *s = pdu->s;
    V9fsFidState *f;
    GHashTableIter iter;
    gpointer fid;
    int err;
    int nclosed = 0;

    /* prevent multiple coroutines running this function simultaneously */
    if (s->reclaiming) {
        return;
    }
    s->reclaiming = true;

    g_hash_table_iter_init(&iter, s->fids);

    QSLIST_HEAD(, V9fsFidState) reclaim_list =
        QSLIST_HEAD_INITIALIZER(reclaim_list);

    /* Pick FIDs to be closed, collect them on reclaim_list. */
    while (g_hash_table_iter_next(&iter, &fid, (gpointer *) &f)) {
        /*
         * Unlinked fids cannot be reclaimed, skip those, and also skip fids
         * currently being operated on.
         */
        if (f->ref || f->flags & FID_NON_RECLAIMABLE) {
            continue;
        }
        /*
         * if it is a recently referenced fid
         * we leave the fid untouched and clear the
         * reference bit. We come back to it later
         * in the next iteration. (a simple LRU without
         * moving list elements around)
         */
        if (f->flags & FID_REFERENCED) {
            f->flags &= ~FID_REFERENCED;
            continue;
        }
        /*
         * Add fids to reclaim list.
         */
        if (f->fid_type == P9_FID_FILE) {
            if (f->fs.fd != -1) {
                /*
                 * Up the reference count so that
                 * a clunk request won't free this fid
                 */
                f->ref++;
                QSLIST_INSERT_HEAD(&reclaim_list, f, reclaim_next);
                /* detach the fd: fs.fd == -1 marks the fid as closed */
                f->fs_reclaim.fd = f->fs.fd;
                f->fs.fd = -1;
                reclaim_count++;
            }
        } else if (f->fid_type == P9_FID_DIR) {
            if (f->fs.dir.stream != NULL) {
                /*
                 * Up the reference count so that
                 * a clunk request won't free this fid
                 */
                f->ref++;
                QSLIST_INSERT_HEAD(&reclaim_list, f, reclaim_next);
                /* detach the stream: a NULL stream marks the fid as closed */
                f->fs_reclaim.dir.stream = f->fs.dir.stream;
                f->fs.dir.stream = NULL;
                reclaim_count++;
            }
        }
        /* stop once the per-run limit is reached */
        if (reclaim_count >= open_fd_rc) {
            break;
        }
    }
    /*
     * Close the picked FIDs altogether on a background I/O driver thread. Do
     * this all at once to keep latency (i.e. amount of thread hops between main
     * thread <-> fs driver background thread) as low as possible.
     */
    v9fs_co_run_in_worker({
        QSLIST_FOREACH(f, &reclaim_list, reclaim_next) {
            err = (f->fid_type == P9_FID_DIR) ?
                s->ops->closedir(&s->ctx, &f->fs_reclaim) :
                s->ops->close(&s->ctx, &f->fs_reclaim);

            /* 'man 2 close' suggests to ignore close() errors except of EBADF */
            if (unlikely(err && errno == EBADF)) {
                /*
                 * unexpected case as FIDs were picked above by having a valid
                 * file descriptor
                 */
                error_report("9pfs: v9fs_reclaim_fd() WARNING: close() failed with EBADF");
            } else {
                /*
                 * total_open_fd must only be mutated on main thread, so only
                 * count here and apply the sum after the worker block
                 */
                nclosed++;
            }
        }
    });
    total_open_fd -= nclosed;
    /* Free the closed FIDs. */
    while (!QSLIST_EMPTY(&reclaim_list)) {
        f = QSLIST_FIRST(&reclaim_list);
        QSLIST_REMOVE(&reclaim_list, f, V9fsFidState, reclaim_next);
        /*
         * Now drop the fid reference, free it
         * if clunked.
         */
        put_fid(pdu, f);
    }

    s->reclaiming = false;
}
540 
541 /*
542  * This is used when a path is removed from the directory tree. Any
543  * fids that still reference it must not be closed from then on, since
544  * they cannot be reopened.
545  */
546 static int coroutine_fn v9fs_mark_fids_unreclaim(V9fsPDU *pdu, V9fsPath *path)
547 {
548     int err = 0;
549     V9fsState *s = pdu->s;
550     V9fsFidState *fidp;
551     gpointer fid;
552     GHashTableIter iter;
553     /*
554      * The most common case is probably that we have exactly one
555      * fid for the given path, so preallocate exactly one.
556      */
557     g_autoptr(GArray) to_reopen = g_array_sized_new(FALSE, FALSE,
558             sizeof(V9fsFidState *), 1);
559     gint i;
560 
561     g_hash_table_iter_init(&iter, s->fids);
562 
563     /*
564      * We iterate over the fid table looking for the entries we need
565      * to reopen, and store them in to_reopen. This is because
566      * v9fs_reopen_fid() and put_fid() yield. This allows the fid table
567      * to be modified in the meantime, invalidating our iterator.
568      */
569     while (g_hash_table_iter_next(&iter, &fid, (gpointer *) &fidp)) {
570         if (fidp->path.size == path->size &&
571             !memcmp(fidp->path.data, path->data, path->size)) {
572             /*
573              * Ensure the fid survives a potential clunk request during
574              * v9fs_reopen_fid or put_fid.
575              */
576             fidp->ref++;
577             fidp->flags |= FID_NON_RECLAIMABLE;
578             g_array_append_val(to_reopen, fidp);
579         }
580     }
581 
582     for (i = 0; i < to_reopen->len; i++) {
583         fidp = g_array_index(to_reopen, V9fsFidState*, i);
584         /* reopen the file/dir if already closed */
585         err = v9fs_reopen_fid(pdu, fidp);
586         if (err < 0) {
587             break;
588         }
589     }
590 
591     for (i = 0; i < to_reopen->len; i++) {
592         put_fid(pdu, g_array_index(to_reopen, V9fsFidState*, i));
593     }
594     return err;
595 }
596 
597 static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
598 {
599     V9fsState *s = pdu->s;
600     V9fsFidState *fidp;
601     GList *freeing;
602     /*
603      * Get a list of all the values (fid states) in the table, which
604      * we then...
605      */
606     g_autoptr(GList) fids = g_hash_table_get_values(s->fids);
607 
608     /* ... remove from the table, taking over ownership. */
609     g_hash_table_steal_all(s->fids);
610 
611     /*
612      * This allows us to release our references to them asynchronously without
613      * iterating over the hash table and risking iterator invalidation
614      * through concurrent modifications.
615      */
616     for (freeing = fids; freeing; freeing = freeing->next) {
617         fidp = freeing->data;
618         fidp->ref++;
619         fidp->clunked = true;
620         put_fid(pdu, fidp);
621     }
622 }
623 
/*
 * Bits of the qid type field.
 * NOTE(review): not used in the part of the file visible here; values
 * presumably follow the 9P protocol definition - confirm against the spec.
 */
#define P9_QID_TYPE_DIR         0x80
#define P9_QID_TYPE_SYMLINK     0x02

/*
 * Bits of the 9P stat mode word above the permission bits.
 * NOTE(review): presumably as defined by 9P2000 / 9P2000.u - confirm against
 * the spec.
 */
#define P9_STAT_MODE_DIR        0x80000000
#define P9_STAT_MODE_APPEND     0x40000000
#define P9_STAT_MODE_EXCL       0x20000000
#define P9_STAT_MODE_MOUNT      0x10000000
#define P9_STAT_MODE_AUTH       0x08000000
#define P9_STAT_MODE_TMP        0x04000000
#define P9_STAT_MODE_SYMLINK    0x02000000
#define P9_STAT_MODE_LINK       0x01000000
#define P9_STAT_MODE_DEVICE     0x00800000
#define P9_STAT_MODE_NAMED_PIPE 0x00200000
#define P9_STAT_MODE_SOCKET     0x00100000
#define P9_STAT_MODE_SETUID     0x00080000
#define P9_STAT_MODE_SETGID     0x00040000
#define P9_STAT_MODE_SETVTX     0x00010000

/*
 * All mode bits that name a file type: directory, symlink, link, device,
 * named pipe and socket.
 */
#define P9_STAT_MODE_TYPE_BITS (P9_STAT_MODE_DIR |          \
                                P9_STAT_MODE_SYMLINK |      \
                                P9_STAT_MODE_LINK |         \
                                P9_STAT_MODE_DEVICE |       \
                                P9_STAT_MODE_NAMED_PIPE |   \
                                P9_STAT_MODE_SOCKET)
648 
/*
 * Reverse the bit order of a byte, e.g. binary 10100000 turns into 00000101.
 *
 * Uses the 64 bit multiply / mask / modulus trick: the multiplication makes
 * five copies of the byte, the mask keeps one bit from each position and the
 * modulus folds them together in reversed order.
 */
static inline uint8_t mirror8bit(uint8_t byte)
{
    const uint64_t copies = byte * 0x0202020202ULL;
    const uint64_t picked = copies & 0x010884422010ULL;

    return (uint8_t)(picked % 1023);
}
654 
/*
 * Reverse the bit order of a 64 bit value.
 *
 * The bytes are visited from least to most significant; each one is bit
 * reversed with mirror8bit() and ends up at the opposite byte position.
 */
static inline uint64_t mirror64bit(uint64_t value)
{
    uint64_t mirrored = 0;

    for (int byte_idx = 0; byte_idx < 8; byte_idx++) {
        const uint8_t byte = (value >> (8 * byte_idx)) & 0xff;

        /* earlier (lower) bytes get pushed towards the high end */
        mirrored = (mirrored << 8) | mirror8bit(byte);
    }

    return mirrored;
}
667 
/*
 * Parameter k for the Exponential Golomb algorithm to be used.
 *
 * The smaller this value, the smaller the minimum bit count of the affixes
 * generated by Exp. Golomb (at the lowest index), at the price of a higher
 * maximum bit count of the generated affixes (at the highest index).
 * Likewise, increasing this parameter yields a smaller maximum bit count at
 * the price of a higher minimum bit count.
 *
 * In practice that means: a good value for k depends on the expected number
 * of devices to be exposed by one export. For a small number of devices k
 * should be small, for a large number of devices k might be increased
 * instead. The default of k=0 should be fine for most users though.
 *
 * IMPORTANT: In case this ever becomes a runtime parameter, the value of
 * k must not change as long as the guest is still running, because that
 * would cause completely different inode numbers to be generated on guest.
 */
#define EXP_GOLOMB_K    0
687 
688 /**
689  * expGolombEncode() - Exponential Golomb algorithm for arbitrary k
690  *                     (including k=0).
691  *
692  * @n: natural number (or index) of the prefix to be generated
693  *     (1, 2, 3, ...)
694  * @k: parameter k of Exp. Golomb algorithm to be used
695  *     (see comment on EXP_GOLOMB_K macro for details about k)
696  * Return: prefix for given @n and @k
697  *
698  * The Exponential Golomb algorithm generates prefixes (NOT suffixes!)
699  * with growing length and with the mathematical property of being
700  * "prefix-free". The latter means the generated prefixes can be prepended
701  * in front of arbitrary numbers and the resulting concatenated numbers are
702  * guaranteed to be always unique.
703  *
704  * This is a minor adjustment to the original Exp. Golomb algorithm in the
705  * sense that lowest allowed index (@n) starts with 1, not with zero.
706  */
707 static VariLenAffix expGolombEncode(uint64_t n, int k)
708 {
709     const uint64_t value = n + (1 << k) - 1;
710     const int bits = (int) log2(value) + 1;
711     return (VariLenAffix) {
712         .type = AffixType_Prefix,
713         .value = value,
714         .bits = bits + MAX((bits - 1 - k), 0)
715     };
716 }
717 
718 /**
719  * invertAffix() - Converts a suffix into a prefix, or a prefix into a suffix.
720  * @affix: either suffix or prefix to be inverted
721  * Return: inversion of passed @affix
722  *
723  * Simply mirror all bits of the affix value, for the purpose to preserve
724  * respectively the mathematical "prefix-free" or "suffix-free" property
725  * after the conversion.
726  *
727  * If a passed prefix is suitable to create unique numbers, then the
728  * returned suffix is suitable to create unique numbers as well (and vice
729  * versa).
730  */
731 static VariLenAffix invertAffix(const VariLenAffix *affix)
732 {
733     return (VariLenAffix) {
734         .type =
735             (affix->type == AffixType_Suffix) ?
736                 AffixType_Prefix : AffixType_Suffix,
737         .value =
738             mirror64bit(affix->value) >>
739             ((sizeof(affix->value) * 8) - affix->bits),
740         .bits = affix->bits
741     };
742 }
743 
744 /**
745  * affixForIndex() - Generates suffix numbers with "suffix-free" property.
746  * @index: natural number (or index) of the suffix to be generated
747  *         (1, 2, 3, ...)
748  * Return: Suffix suitable to assemble unique number.
749  *
750  * This is just a wrapper function on top of the Exp. Golomb algorithm.
751  *
752  * Since the Exp. Golomb algorithm generates prefixes, but we need suffixes,
753  * this function converts the Exp. Golomb prefixes into appropriate suffixes
754  * which are still suitable for generating unique numbers.
755  */
756 static VariLenAffix affixForIndex(uint64_t index)
757 {
758     VariLenAffix prefix;
759     prefix = expGolombEncode(index, EXP_GOLOMB_K);
760     return invertAffix(&prefix); /* convert prefix to suffix */
761 }
762 
763 static uint32_t qpp_hash(QppEntry e)
764 {
765     return qemu_xxhash4(e.ino_prefix, e.dev);
766 }
767 
768 static uint32_t qpf_hash(QpfEntry e)
769 {
770     return qemu_xxhash4(e.ino, e.dev);
771 }
772 
773 static bool qpd_cmp_func(const void *obj, const void *userp)
774 {
775     const QpdEntry *e1 = obj, *e2 = userp;
776     return e1->dev == e2->dev;
777 }
778 
779 static bool qpp_cmp_func(const void *obj, const void *userp)
780 {
781     const QppEntry *e1 = obj, *e2 = userp;
782     return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
783 }
784 
785 static bool qpf_cmp_func(const void *obj, const void *userp)
786 {
787     const QpfEntry *e1 = obj, *e2 = userp;
788     return e1->dev == e2->dev && e1->ino == e2->ino;
789 }
790 
/*
 * qht_iter() callback used by qp_table_destroy(): frees the entry @p.
 * The hash @h and the user pointer @up are not needed.
 */
static void qp_table_remove(void *p, uint32_t h, void *up)
{
    (void)h;
    (void)up;
    g_free(p);
}
795 
796 static void qp_table_destroy(struct qht *ht)
797 {
798     if (!ht || !ht->map) {
799         return;
800     }
801     qht_iter(ht, qp_table_remove, NULL);
802     qht_destroy(ht);
803 }
804 
805 static void qpd_table_init(struct qht *ht)
806 {
807     qht_init(ht, qpd_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
808 }
809 
810 static void qpp_table_init(struct qht *ht)
811 {
812     qht_init(ht, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
813 }
814 
815 static void qpf_table_init(struct qht *ht)
816 {
817     qht_init(ht, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
818 }
819 
820 /*
821  * Returns how many (high end) bits of inode numbers of the passed fs
822  * device shall be used (in combination with the device number) to
823  * generate hash values for qpp_table entries.
824  *
825  * This function is required if variable length suffixes are used for inode
826  * number mapping on guest level. Since a device may end up having multiple
827  * entries in qpp_table, each entry most probably with a different suffix
828  * length, we thus need this function in conjunction with qpd_table to
 * "agree" on a fixed amount of bits (per device) to be always used for
 * generating hash values for the purpose of accessing qpp_table, in order
 * to get consistent behaviour when accessing qpp_table.
832  */
833 static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
834 {
835     QpdEntry lookup = {
836         .dev = dev
837     }, *val;
838     uint32_t hash = dev;
839     VariLenAffix affix;
840 
841     val = qht_lookup(&pdu->s->qpd_table, &lookup, hash);
842     if (!val) {
843         val = g_new0(QpdEntry, 1);
844         *val = lookup;
845         affix = affixForIndex(pdu->s->qp_affix_next);
846         val->prefix_bits = affix.bits;
847         qht_insert(&pdu->s->qpd_table, val, hash, NULL);
848         pdu->s->qp_ndevices++;
849     }
850     return val->prefix_bits;
851 }
852 
853 /*
854  * Slow / full mapping host inode nr -> guest inode nr.
855  *
856  * This function performs a slower and much more costly remapping of an
857  * original file inode number on host to an appropriate different inode
858  * number on guest. For every (dev, inode) combination on host a new
859  * sequential number is generated, cached and exposed as inode number on
860  * guest.
861  *
862  * This is just a "last resort" fallback solution if the much faster/cheaper
863  * qid_path_suffixmap() failed. In practice this slow / full mapping is not
864  * expected ever to be used at all though.
865  *
866  * See qid_path_suffixmap() for details
867  *
868  */
869 static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
870                             uint64_t *path)
871 {
872     QpfEntry lookup = {
873         .dev = stbuf->st_dev,
874         .ino = stbuf->st_ino
875     }, *val;
876     uint32_t hash = qpf_hash(lookup);
877     VariLenAffix affix;
878 
879     val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
880 
881     if (!val) {
882         if (pdu->s->qp_fullpath_next == 0) {
883             /* no more files can be mapped :'( */
884             error_report_once(
885                 "9p: No more prefixes available for remapping inodes from "
886                 "host to guest."
887             );
888             return -ENFILE;
889         }
890 
891         val = g_new0(QpfEntry, 1);
892         *val = lookup;
893 
894         /* new unique inode and device combo */
895         affix = affixForIndex(
896             1ULL << (sizeof(pdu->s->qp_affix_next) * 8)
897         );
898         val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value;
899         pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
900         qht_insert(&pdu->s->qpf_table, val, hash, NULL);
901     }
902 
903     *path = val->path;
904     return 0;
905 }
906 
907 /*
908  * Quick mapping host inode nr -> guest inode nr.
909  *
910  * This function performs quick remapping of an original file inode number
911  * on host to an appropriate different inode number on guest. This remapping
912  * of inodes is required to avoid inode nr collisions on guest which would
913  * happen if the 9p export contains more than 1 exported file system (or
914  * more than 1 file system data set), because unlike on host level where the
915  * files would have different device nrs, all files exported by 9p would
916  * share the same device nr on guest (the device nr of the virtual 9p device
917  * that is).
918  *
919  * Inode remapping is performed by chopping off high end bits of the original
920  * inode number from host, shifting the result upwards and then assigning a
921  * generated suffix number for the low end bits, where the same suffix number
922  * will be shared by all inodes with the same device id AND the same high end
923  * bits that have been chopped off. That approach utilizes the fact that inode
924  * numbers very likely share the same high end bits (i.e. due to their common
925  * sequential generation by file systems) and hence we only have to generate
926  * and track a very limited amount of suffixes in practice due to that.
927  *
928  * We generate variable size suffixes for that purpose. The 1st generated
929  * suffix will only have 1 bit and hence we only need to chop off 1 bit from
930  * the original inode number. The subsequent suffixes being generated will
931  * grow in (bit) size subsequently, i.e. the 2nd and 3rd suffix being
932  * generated will have 3 bits and hence we have to chop off 3 bits from their
933  * original inodes, and so on. That approach of using variable length suffixes
934  * (i.e. over fixed size ones) utilizes the fact that in practice only a very
935  * limited amount of devices are shared by the same export (e.g. typically
936  * less than 2 dozen devices per 9p export), so in practice we need to chop
937  * off less bits than with fixed size prefixes and yet are flexible to add
938  * new devices at runtime below host's export directory at any time without
939  * having to reboot guest nor requiring to reconfigure guest for that. And due
940  * to the very limited amount of original high end bits that we chop off that
941  * way, the total amount of suffixes we need to generate is less than by using
942  * fixed size prefixes and hence it also improves performance of the inode
943  * remapping algorithm, and finally has the nice side effect that the inode
944  * numbers on guest will be much smaller & human friendly. ;-)
945  */
946 static int qid_path_suffixmap(V9fsPDU *pdu, const struct stat *stbuf,
947                               uint64_t *path)
948 {
949     const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev);
950     QppEntry lookup = {
951         .dev = stbuf->st_dev,
952         .ino_prefix = (uint16_t) (stbuf->st_ino >> (64 - ino_hash_bits))
953     }, *val;
954     uint32_t hash = qpp_hash(lookup);
955 
956     val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
957 
958     if (!val) {
959         if (pdu->s->qp_affix_next == 0) {
960             /* we ran out of affixes */
961             warn_report_once(
962                 "9p: Potential degraded performance of inode remapping"
963             );
964             return -ENFILE;
965         }
966 
967         val = g_new0(QppEntry, 1);
968         *val = lookup;
969 
970         /* new unique inode affix and device combo */
971         val->qp_affix_index = pdu->s->qp_affix_next++;
972         val->qp_affix = affixForIndex(val->qp_affix_index);
973         qht_insert(&pdu->s->qpp_table, val, hash, NULL);
974     }
975     /* assuming generated affix to be suffix type, not prefix */
976     *path = (stbuf->st_ino << val->qp_affix.bits) | val->qp_affix.value;
977     return 0;
978 }
979 
980 static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
981 {
982     int err;
983     size_t size;
984 
985     if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
986         /* map inode+device to qid path (fast path) */
987         err = qid_path_suffixmap(pdu, stbuf, &qidp->path);
988         if (err == -ENFILE) {
989             /* fast path didn't work, fall back to full map */
990             err = qid_path_fullmap(pdu, stbuf, &qidp->path);
991         }
992         if (err) {
993             return err;
994         }
995     } else {
996         if (pdu->s->dev_id != stbuf->st_dev) {
997             if (pdu->s->ctx.export_flags & V9FS_FORBID_MULTIDEVS) {
998                 error_report_once(
999                     "9p: Multiple devices detected in same VirtFS export. "
1000                     "Access of guest to additional devices is (partly) "
1001                     "denied due to virtfs option 'multidevs=forbid' being "
1002                     "effective."
1003                 );
1004                 return -ENODEV;
1005             } else {
1006                 warn_report_once(
1007                     "9p: Multiple devices detected in same VirtFS export, "
1008                     "which might lead to file ID collisions and severe "
1009                     "misbehaviours on guest! You should either use a "
1010                     "separate export for each device shared from host or "
1011                     "use virtfs option 'multidevs=remap'!"
1012                 );
1013             }
1014         }
1015         memset(&qidp->path, 0, sizeof(qidp->path));
1016         size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
1017         memcpy(&qidp->path, &stbuf->st_ino, size);
1018     }
1019 
1020     qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
1021     qidp->type = 0;
1022     if (S_ISDIR(stbuf->st_mode)) {
1023         qidp->type |= P9_QID_TYPE_DIR;
1024     }
1025     if (S_ISLNK(stbuf->st_mode)) {
1026         qidp->type |= P9_QID_TYPE_SYMLINK;
1027     }
1028 
1029     return 0;
1030 }
1031 
1032 V9fsPDU *pdu_alloc(V9fsState *s)
1033 {
1034     V9fsPDU *pdu = NULL;
1035 
1036     if (!QLIST_EMPTY(&s->free_list)) {
1037         pdu = QLIST_FIRST(&s->free_list);
1038         QLIST_REMOVE(pdu, next);
1039         QLIST_INSERT_HEAD(&s->active_list, pdu, next);
1040     }
1041     return pdu;
1042 }
1043 
1044 void pdu_free(V9fsPDU *pdu)
1045 {
1046     V9fsState *s = pdu->s;
1047 
1048     g_assert(!pdu->cancelled);
1049     QLIST_REMOVE(pdu, next);
1050     QLIST_INSERT_HEAD(&s->free_list, pdu, next);
1051 }
1052 
1053 static void coroutine_fn pdu_complete(V9fsPDU *pdu, ssize_t len)
1054 {
1055     int8_t id = pdu->id + 1; /* Response */
1056     V9fsState *s = pdu->s;
1057     int ret;
1058 
1059     /*
1060      * The 9p spec requires that successfully cancelled pdus receive no reply.
1061      * Sending a reply would confuse clients because they would
1062      * assume that any EINTR is the actual result of the operation,
1063      * rather than a consequence of the cancellation. However, if
1064      * the operation completed (successfully or with an error other
1065      * than caused be cancellation), we do send out that reply, both
1066      * for efficiency and to avoid confusing the rest of the state machine
1067      * that assumes passing a non-error here will mean a successful
1068      * transmission of the reply.
1069      */
1070     bool discard = pdu->cancelled && len == -EINTR;
1071     if (discard) {
1072         trace_v9fs_rcancel(pdu->tag, pdu->id);
1073         pdu->size = 0;
1074         goto out_notify;
1075     }
1076 
1077     if (len < 0) {
1078         int err = -len;
1079         len = 7;
1080 
1081         if (s->proto_version != V9FS_PROTO_2000L) {
1082             V9fsString str;
1083 
1084             str.data = strerror(err);
1085             str.size = strlen(str.data);
1086 
1087             ret = pdu_marshal(pdu, len, "s", &str);
1088             if (ret < 0) {
1089                 goto out_notify;
1090             }
1091             len += ret;
1092             id = P9_RERROR;
1093         } else {
1094             err = errno_to_dotl(err);
1095         }
1096 
1097         ret = pdu_marshal(pdu, len, "d", err);
1098         if (ret < 0) {
1099             goto out_notify;
1100         }
1101         len += ret;
1102 
1103         if (s->proto_version == V9FS_PROTO_2000L) {
1104             id = P9_RLERROR;
1105         }
1106         trace_v9fs_rerror(pdu->tag, pdu->id, err); /* Trace ERROR */
1107     }
1108 
1109     /* fill out the header */
1110     if (pdu_marshal(pdu, 0, "dbw", (int32_t)len, id, pdu->tag) < 0) {
1111         goto out_notify;
1112     }
1113 
1114     /* keep these in sync */
1115     pdu->size = len;
1116     pdu->id = id;
1117 
1118 out_notify:
1119     pdu->s->transport->push_and_notify(pdu);
1120 
1121     /* Now wakeup anybody waiting in flush for this request */
1122     if (!qemu_co_queue_next(&pdu->complete)) {
1123         pdu_free(pdu);
1124     }
1125 }
1126 
1127 static mode_t v9mode_to_mode(uint32_t mode, V9fsString *extension)
1128 {
1129     mode_t ret;
1130 
1131     ret = mode & 0777;
1132     if (mode & P9_STAT_MODE_DIR) {
1133         ret |= S_IFDIR;
1134     }
1135 
1136     if (mode & P9_STAT_MODE_SYMLINK) {
1137         ret |= S_IFLNK;
1138     }
1139     if (mode & P9_STAT_MODE_SOCKET) {
1140         ret |= S_IFSOCK;
1141     }
1142     if (mode & P9_STAT_MODE_NAMED_PIPE) {
1143         ret |= S_IFIFO;
1144     }
1145     if (mode & P9_STAT_MODE_DEVICE) {
1146         if (extension->size && extension->data[0] == 'c') {
1147             ret |= S_IFCHR;
1148         } else {
1149             ret |= S_IFBLK;
1150         }
1151     }
1152 
1153     if (!(ret & ~0777)) {
1154         ret |= S_IFREG;
1155     }
1156 
1157     if (mode & P9_STAT_MODE_SETUID) {
1158         ret |= S_ISUID;
1159     }
1160     if (mode & P9_STAT_MODE_SETGID) {
1161         ret |= S_ISGID;
1162     }
1163     if (mode & P9_STAT_MODE_SETVTX) {
1164         ret |= S_ISVTX;
1165     }
1166 
1167     return ret;
1168 }
1169 
1170 static int donttouch_stat(V9fsStat *stat)
1171 {
1172     if (stat->type == -1 &&
1173         stat->dev == -1 &&
1174         stat->qid.type == 0xff &&
1175         stat->qid.version == (uint32_t) -1 &&
1176         stat->qid.path == (uint64_t) -1 &&
1177         stat->mode == -1 &&
1178         stat->atime == -1 &&
1179         stat->mtime == -1 &&
1180         stat->length == -1 &&
1181         !stat->name.size &&
1182         !stat->uid.size &&
1183         !stat->gid.size &&
1184         !stat->muid.size &&
1185         stat->n_uid == -1 &&
1186         stat->n_gid == -1 &&
1187         stat->n_muid == -1) {
1188         return 1;
1189     }
1190 
1191     return 0;
1192 }
1193 
1194 static void v9fs_stat_init(V9fsStat *stat)
1195 {
1196     v9fs_string_init(&stat->name);
1197     v9fs_string_init(&stat->uid);
1198     v9fs_string_init(&stat->gid);
1199     v9fs_string_init(&stat->muid);
1200     v9fs_string_init(&stat->extension);
1201 }
1202 
1203 static void v9fs_stat_free(V9fsStat *stat)
1204 {
1205     v9fs_string_free(&stat->name);
1206     v9fs_string_free(&stat->uid);
1207     v9fs_string_free(&stat->gid);
1208     v9fs_string_free(&stat->muid);
1209     v9fs_string_free(&stat->extension);
1210 }
1211 
1212 static uint32_t stat_to_v9mode(const struct stat *stbuf)
1213 {
1214     uint32_t mode;
1215 
1216     mode = stbuf->st_mode & 0777;
1217     if (S_ISDIR(stbuf->st_mode)) {
1218         mode |= P9_STAT_MODE_DIR;
1219     }
1220 
1221     if (S_ISLNK(stbuf->st_mode)) {
1222         mode |= P9_STAT_MODE_SYMLINK;
1223     }
1224 
1225     if (S_ISSOCK(stbuf->st_mode)) {
1226         mode |= P9_STAT_MODE_SOCKET;
1227     }
1228 
1229     if (S_ISFIFO(stbuf->st_mode)) {
1230         mode |= P9_STAT_MODE_NAMED_PIPE;
1231     }
1232 
1233     if (S_ISBLK(stbuf->st_mode) || S_ISCHR(stbuf->st_mode)) {
1234         mode |= P9_STAT_MODE_DEVICE;
1235     }
1236 
1237     if (stbuf->st_mode & S_ISUID) {
1238         mode |= P9_STAT_MODE_SETUID;
1239     }
1240 
1241     if (stbuf->st_mode & S_ISGID) {
1242         mode |= P9_STAT_MODE_SETGID;
1243     }
1244 
1245     if (stbuf->st_mode & S_ISVTX) {
1246         mode |= P9_STAT_MODE_SETVTX;
1247     }
1248 
1249     return mode;
1250 }
1251 
1252 static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
1253                                        const char *basename,
1254                                        const struct stat *stbuf,
1255                                        V9fsStat *v9stat)
1256 {
1257     int err;
1258 
1259     memset(v9stat, 0, sizeof(*v9stat));
1260 
1261     err = stat_to_qid(pdu, stbuf, &v9stat->qid);
1262     if (err < 0) {
1263         return err;
1264     }
1265     v9stat->mode = stat_to_v9mode(stbuf);
1266     v9stat->atime = stbuf->st_atime;
1267     v9stat->mtime = stbuf->st_mtime;
1268     v9stat->length = stbuf->st_size;
1269 
1270     v9fs_string_free(&v9stat->uid);
1271     v9fs_string_free(&v9stat->gid);
1272     v9fs_string_free(&v9stat->muid);
1273 
1274     v9stat->n_uid = stbuf->st_uid;
1275     v9stat->n_gid = stbuf->st_gid;
1276     v9stat->n_muid = 0;
1277 
1278     v9fs_string_free(&v9stat->extension);
1279 
1280     if (v9stat->mode & P9_STAT_MODE_SYMLINK) {
1281         err = v9fs_co_readlink(pdu, path, &v9stat->extension);
1282         if (err < 0) {
1283             return err;
1284         }
1285     } else if (v9stat->mode & P9_STAT_MODE_DEVICE) {
1286         v9fs_string_sprintf(&v9stat->extension, "%c %u %u",
1287                 S_ISCHR(stbuf->st_mode) ? 'c' : 'b',
1288                 major(stbuf->st_rdev), minor(stbuf->st_rdev));
1289     } else if (S_ISDIR(stbuf->st_mode) || S_ISREG(stbuf->st_mode)) {
1290         v9fs_string_sprintf(&v9stat->extension, "%s %lu",
1291                 "HARDLINKCOUNT", (unsigned long)stbuf->st_nlink);
1292     }
1293 
1294     v9fs_string_sprintf(&v9stat->name, "%s", basename);
1295 
1296     v9stat->size = 61 +
1297         v9fs_string_size(&v9stat->name) +
1298         v9fs_string_size(&v9stat->uid) +
1299         v9fs_string_size(&v9stat->gid) +
1300         v9fs_string_size(&v9stat->muid) +
1301         v9fs_string_size(&v9stat->extension);
1302     return 0;
1303 }
1304 
/*
 * Bit flags naming the individual fields of a V9fsStatDotl record.
 *
 * v9fs_getattr() tests the client's request mask against these bits and
 * stat_to_v9stat_dotl() reports the fields it filled in through
 * st_result_mask using the same bits.
 */
#define P9_STATS_MODE          0x00000001ULL
#define P9_STATS_NLINK         0x00000002ULL
#define P9_STATS_UID           0x00000004ULL
#define P9_STATS_GID           0x00000008ULL
#define P9_STATS_RDEV          0x00000010ULL
#define P9_STATS_ATIME         0x00000020ULL
#define P9_STATS_MTIME         0x00000040ULL
#define P9_STATS_CTIME         0x00000080ULL
#define P9_STATS_INO           0x00000100ULL
#define P9_STATS_SIZE          0x00000200ULL
#define P9_STATS_BLOCKS        0x00000400ULL

/* Fields beyond the basic set (not covered by P9_STATS_BASIC) */
#define P9_STATS_BTIME         0x00000800ULL
#define P9_STATS_GEN           0x00001000ULL
#define P9_STATS_DATA_VERSION  0x00002000ULL

/* Combined masks: bits 0..10 and bits 0..13 respectively */
#define P9_STATS_BASIC         0x000007ffULL /* Mask for fields up to BLOCKS */
#define P9_STATS_ALL           0x00003fffULL /* Mask for All fields above */
1323 
1324 
1325 /**
1326  * blksize_to_iounit() - Block size exposed to 9p client.
1327  * Return: block size
1328  *
1329  * @pdu: 9p client request
1330  * @blksize: host filesystem's block size
1331  *
1332  * Convert host filesystem's block size into an appropriate block size for
1333  * 9p client (guest OS side). The value returned suggests an "optimum" block
1334  * size for 9p I/O, i.e. to maximize performance.
1335  */
1336 static int32_t blksize_to_iounit(const V9fsPDU *pdu, int32_t blksize)
1337 {
1338     int32_t iounit = 0;
1339     V9fsState *s = pdu->s;
1340 
1341     /*
1342      * iounit should be multiples of blksize (host filesystem block size)
1343      * as well as less than (client msize - P9_IOHDRSZ)
1344      */
1345     if (blksize) {
1346         iounit = QEMU_ALIGN_DOWN(s->msize - P9_IOHDRSZ, blksize);
1347     }
1348     if (!iounit) {
1349         iounit = s->msize - P9_IOHDRSZ;
1350     }
1351     return iounit;
1352 }
1353 
1354 static int32_t stat_to_iounit(const V9fsPDU *pdu, const struct stat *stbuf)
1355 {
1356     return blksize_to_iounit(pdu, stbuf->st_blksize);
1357 }
1358 
1359 static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
1360                                 V9fsStatDotl *v9lstat)
1361 {
1362     memset(v9lstat, 0, sizeof(*v9lstat));
1363 
1364     v9lstat->st_mode = stbuf->st_mode;
1365     v9lstat->st_nlink = stbuf->st_nlink;
1366     v9lstat->st_uid = stbuf->st_uid;
1367     v9lstat->st_gid = stbuf->st_gid;
1368     v9lstat->st_rdev = host_dev_to_dotl_dev(stbuf->st_rdev);
1369     v9lstat->st_size = stbuf->st_size;
1370     v9lstat->st_blksize = stat_to_iounit(pdu, stbuf);
1371     v9lstat->st_blocks = stbuf->st_blocks;
1372     v9lstat->st_atime_sec = stbuf->st_atime;
1373     v9lstat->st_mtime_sec = stbuf->st_mtime;
1374     v9lstat->st_ctime_sec = stbuf->st_ctime;
1375 #ifdef CONFIG_DARWIN
1376     v9lstat->st_atime_nsec = stbuf->st_atimespec.tv_nsec;
1377     v9lstat->st_mtime_nsec = stbuf->st_mtimespec.tv_nsec;
1378     v9lstat->st_ctime_nsec = stbuf->st_ctimespec.tv_nsec;
1379 #else
1380     v9lstat->st_atime_nsec = stbuf->st_atim.tv_nsec;
1381     v9lstat->st_mtime_nsec = stbuf->st_mtim.tv_nsec;
1382     v9lstat->st_ctime_nsec = stbuf->st_ctim.tv_nsec;
1383 #endif
1384     /* Currently we only support BASIC fields in stat */
1385     v9lstat->st_result_mask = P9_STATS_BASIC;
1386 
1387     return stat_to_qid(pdu, stbuf, &v9lstat->qid);
1388 }
1389 
/*
 * Debug helper: print a scatter-gather list to stdout in the form
 * "sg[<cnt>]: {(<base>, <len>), (<base>, <len>), ...}".
 *
 * @sg: array holding at least @cnt entries
 * @cnt: number of entries to print
 */
static void print_sg(struct iovec *sg, int cnt)
{
    int i;

    printf("sg[%d]: {", cnt);
    for (i = 0; i < cnt; i++) {
        if (i) {
            printf(", ");
        }
        /* iov_len is a size_t, so it has to be printed with %zu, not %zd */
        printf("(%p, %zu)", sg[i].iov_base, sg[i].iov_len);
    }
    printf("}\n");
}
1403 
1404 /* Will call this only for path name based fid */
1405 static void v9fs_fix_path(V9fsPath *dst, V9fsPath *src, int len)
1406 {
1407     V9fsPath str;
1408     v9fs_path_init(&str);
1409     v9fs_path_copy(&str, dst);
1410     v9fs_path_sprintf(dst, "%s%s", src->data, str.data + len);
1411     v9fs_path_free(&str);
1412 }
1413 
1414 static inline bool is_ro_export(FsContext *ctx)
1415 {
1416     return ctx->export_flags & V9FS_RDONLY;
1417 }
1418 
1419 static void coroutine_fn v9fs_version(void *opaque)
1420 {
1421     ssize_t err;
1422     V9fsPDU *pdu = opaque;
1423     V9fsState *s = pdu->s;
1424     V9fsString version;
1425     size_t offset = 7;
1426 
1427     v9fs_string_init(&version);
1428     err = pdu_unmarshal(pdu, offset, "ds", &s->msize, &version);
1429     if (err < 0) {
1430         goto out;
1431     }
1432     trace_v9fs_version(pdu->tag, pdu->id, s->msize, version.data);
1433 
1434     virtfs_reset(pdu);
1435 
1436     if (!strcmp(version.data, "9P2000.u")) {
1437         s->proto_version = V9FS_PROTO_2000U;
1438     } else if (!strcmp(version.data, "9P2000.L")) {
1439         s->proto_version = V9FS_PROTO_2000L;
1440     } else {
1441         v9fs_string_sprintf(&version, "unknown");
1442         /* skip min. msize check, reporting invalid version has priority */
1443         goto marshal;
1444     }
1445 
1446     if (s->msize < P9_MIN_MSIZE) {
1447         err = -EMSGSIZE;
1448         error_report(
1449             "9pfs: Client requested msize < minimum msize ("
1450             stringify(P9_MIN_MSIZE) ") supported by this server."
1451         );
1452         goto out;
1453     }
1454 
1455     /* 8192 is the default msize of Linux clients */
1456     if (s->msize <= 8192 && !(s->ctx.export_flags & V9FS_NO_PERF_WARN)) {
1457         warn_report_once(
1458             "9p: degraded performance: a reasonable high msize should be "
1459             "chosen on client/guest side (chosen msize is <= 8192). See "
1460             "https://wiki.qemu.org/Documentation/9psetup#msize for details."
1461         );
1462     }
1463 
1464 marshal:
1465     err = pdu_marshal(pdu, offset, "ds", s->msize, &version);
1466     if (err < 0) {
1467         goto out;
1468     }
1469     err += offset;
1470     trace_v9fs_version_return(pdu->tag, pdu->id, s->msize, version.data);
1471 out:
1472     pdu_complete(pdu, err);
1473     v9fs_string_free(&version);
1474 }
1475 
1476 static void coroutine_fn v9fs_attach(void *opaque)
1477 {
1478     V9fsPDU *pdu = opaque;
1479     V9fsState *s = pdu->s;
1480     int32_t fid, afid, n_uname;
1481     V9fsString uname, aname;
1482     V9fsFidState *fidp;
1483     size_t offset = 7;
1484     V9fsQID qid;
1485     ssize_t err;
1486     struct stat stbuf;
1487 
1488     v9fs_string_init(&uname);
1489     v9fs_string_init(&aname);
1490     err = pdu_unmarshal(pdu, offset, "ddssd", &fid,
1491                         &afid, &uname, &aname, &n_uname);
1492     if (err < 0) {
1493         goto out_nofid;
1494     }
1495     trace_v9fs_attach(pdu->tag, pdu->id, fid, afid, uname.data, aname.data);
1496 
1497     fidp = alloc_fid(s, fid);
1498     if (fidp == NULL) {
1499         err = -EINVAL;
1500         goto out_nofid;
1501     }
1502     fidp->uid = n_uname;
1503     err = v9fs_co_name_to_path(pdu, NULL, "/", &fidp->path);
1504     if (err < 0) {
1505         err = -EINVAL;
1506         clunk_fid(s, fid);
1507         goto out;
1508     }
1509     err = v9fs_co_lstat(pdu, &fidp->path, &stbuf);
1510     if (err < 0) {
1511         err = -EINVAL;
1512         clunk_fid(s, fid);
1513         goto out;
1514     }
1515     err = stat_to_qid(pdu, &stbuf, &qid);
1516     if (err < 0) {
1517         err = -EINVAL;
1518         clunk_fid(s, fid);
1519         goto out;
1520     }
1521 
1522     /*
1523      * disable migration if we haven't done already.
1524      * attach could get called multiple times for the same export.
1525      */
1526     if (!s->migration_blocker) {
1527         error_setg(&s->migration_blocker,
1528                    "Migration is disabled when VirtFS export path '%s' is mounted in the guest using mount_tag '%s'",
1529                    s->ctx.fs_root ? s->ctx.fs_root : "NULL", s->tag);
1530         err = migrate_add_blocker(&s->migration_blocker, NULL);
1531         if (err < 0) {
1532             clunk_fid(s, fid);
1533             goto out;
1534         }
1535         s->root_fid = fid;
1536     }
1537 
1538     err = pdu_marshal(pdu, offset, "Q", &qid);
1539     if (err < 0) {
1540         clunk_fid(s, fid);
1541         goto out;
1542     }
1543     err += offset;
1544 
1545     memcpy(&s->root_st, &stbuf, sizeof(stbuf));
1546     trace_v9fs_attach_return(pdu->tag, pdu->id,
1547                              qid.type, qid.version, qid.path);
1548 out:
1549     put_fid(pdu, fidp);
1550 out_nofid:
1551     pdu_complete(pdu, err);
1552     v9fs_string_free(&uname);
1553     v9fs_string_free(&aname);
1554 }
1555 
1556 static void coroutine_fn v9fs_stat(void *opaque)
1557 {
1558     int32_t fid;
1559     V9fsStat v9stat;
1560     ssize_t err = 0;
1561     size_t offset = 7;
1562     struct stat stbuf;
1563     V9fsFidState *fidp;
1564     V9fsPDU *pdu = opaque;
1565     char *basename;
1566 
1567     err = pdu_unmarshal(pdu, offset, "d", &fid);
1568     if (err < 0) {
1569         goto out_nofid;
1570     }
1571     trace_v9fs_stat(pdu->tag, pdu->id, fid);
1572 
1573     fidp = get_fid(pdu, fid);
1574     if (fidp == NULL) {
1575         err = -ENOENT;
1576         goto out_nofid;
1577     }
1578     err = v9fs_co_lstat(pdu, &fidp->path, &stbuf);
1579     if (err < 0) {
1580         goto out;
1581     }
1582     basename = g_path_get_basename(fidp->path.data);
1583     err = stat_to_v9stat(pdu, &fidp->path, basename, &stbuf, &v9stat);
1584     g_free(basename);
1585     if (err < 0) {
1586         goto out;
1587     }
1588     err = pdu_marshal(pdu, offset, "wS", 0, &v9stat);
1589     if (err < 0) {
1590         v9fs_stat_free(&v9stat);
1591         goto out;
1592     }
1593     trace_v9fs_stat_return(pdu->tag, pdu->id, v9stat.mode,
1594                            v9stat.atime, v9stat.mtime, v9stat.length);
1595     err += offset;
1596     v9fs_stat_free(&v9stat);
1597 out:
1598     put_fid(pdu, fidp);
1599 out_nofid:
1600     pdu_complete(pdu, err);
1601 }
1602 
1603 static bool fid_has_valid_file_handle(V9fsState *s, V9fsFidState *fidp)
1604 {
1605     return s->ops->has_valid_file_handle(fidp->fid_type, &fidp->fs);
1606 }
1607 
1608 static void coroutine_fn v9fs_getattr(void *opaque)
1609 {
1610     int32_t fid;
1611     size_t offset = 7;
1612     ssize_t retval = 0;
1613     struct stat stbuf;
1614     V9fsFidState *fidp;
1615     uint64_t request_mask;
1616     V9fsStatDotl v9stat_dotl;
1617     V9fsPDU *pdu = opaque;
1618 
1619     retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask);
1620     if (retval < 0) {
1621         goto out_nofid;
1622     }
1623     trace_v9fs_getattr(pdu->tag, pdu->id, fid, request_mask);
1624 
1625     fidp = get_fid(pdu, fid);
1626     if (fidp == NULL) {
1627         retval = -ENOENT;
1628         goto out_nofid;
1629     }
1630     if (fid_has_valid_file_handle(pdu->s, fidp)) {
1631         retval = v9fs_co_fstat(pdu, fidp, &stbuf);
1632     } else {
1633         retval = v9fs_co_lstat(pdu, &fidp->path, &stbuf);
1634     }
1635     if (retval < 0) {
1636         goto out;
1637     }
1638     retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl);
1639     if (retval < 0) {
1640         goto out;
1641     }
1642 
1643     /*  fill st_gen if requested and supported by underlying fs */
1644     if (request_mask & P9_STATS_GEN) {
1645         retval = v9fs_co_st_gen(pdu, &fidp->path, stbuf.st_mode, &v9stat_dotl);
1646         switch (retval) {
1647         case 0:
1648             /* we have valid st_gen: update result mask */
1649             v9stat_dotl.st_result_mask |= P9_STATS_GEN;
1650             break;
1651         case -EINTR:
1652             /* request cancelled, e.g. by Tflush */
1653             goto out;
1654         default:
1655             /* failed to get st_gen: not fatal, ignore */
1656             break;
1657         }
1658     }
1659     retval = pdu_marshal(pdu, offset, "A", &v9stat_dotl);
1660     if (retval < 0) {
1661         goto out;
1662     }
1663     retval += offset;
1664     trace_v9fs_getattr_return(pdu->tag, pdu->id, v9stat_dotl.st_result_mask,
1665                               v9stat_dotl.st_mode, v9stat_dotl.st_uid,
1666                               v9stat_dotl.st_gid);
1667 out:
1668     put_fid(pdu, fidp);
1669 out_nofid:
1670     pdu_complete(pdu, retval);
1671 }
1672 
/*
 * Attribute flags
 *
 * Bits of the "valid" member of V9fsIattr: v9fs_setattr() only applies the
 * attributes whose bit is set. The *_SET variants select between an explicit
 * timestamp from the request and the current time (UTIME_NOW).
 */
#define P9_ATTR_MODE       (1 << 0)
#define P9_ATTR_UID        (1 << 1)
#define P9_ATTR_GID        (1 << 2)
#define P9_ATTR_SIZE       (1 << 3)
#define P9_ATTR_ATIME      (1 << 4)
#define P9_ATTR_MTIME      (1 << 5)
#define P9_ATTR_CTIME      (1 << 6)
#define P9_ATTR_ATIME_SET  (1 << 7)
#define P9_ATTR_MTIME_SET  (1 << 8)

/* 127 == bits 0..6: MODE up to and including CTIME, without the *_SET bits */
#define P9_ATTR_MASK    127
1685 
1686 static void coroutine_fn v9fs_setattr(void *opaque)
1687 {
1688     int err = 0;
1689     int32_t fid;
1690     V9fsFidState *fidp;
1691     size_t offset = 7;
1692     V9fsIattr v9iattr;
1693     V9fsPDU *pdu = opaque;
1694 
1695     err = pdu_unmarshal(pdu, offset, "dI", &fid, &v9iattr);
1696     if (err < 0) {
1697         goto out_nofid;
1698     }
1699 
1700     trace_v9fs_setattr(pdu->tag, pdu->id, fid,
1701                        v9iattr.valid, v9iattr.mode, v9iattr.uid, v9iattr.gid,
1702                        v9iattr.size, v9iattr.atime_sec, v9iattr.mtime_sec);
1703 
1704     fidp = get_fid(pdu, fid);
1705     if (fidp == NULL) {
1706         err = -EINVAL;
1707         goto out_nofid;
1708     }
1709     if (v9iattr.valid & P9_ATTR_MODE) {
1710         err = v9fs_co_chmod(pdu, &fidp->path, v9iattr.mode);
1711         if (err < 0) {
1712             goto out;
1713         }
1714     }
1715     if (v9iattr.valid & (P9_ATTR_ATIME | P9_ATTR_MTIME)) {
1716         struct timespec times[2];
1717         if (v9iattr.valid & P9_ATTR_ATIME) {
1718             if (v9iattr.valid & P9_ATTR_ATIME_SET) {
1719                 times[0].tv_sec = v9iattr.atime_sec;
1720                 times[0].tv_nsec = v9iattr.atime_nsec;
1721             } else {
1722                 times[0].tv_nsec = UTIME_NOW;
1723             }
1724         } else {
1725             times[0].tv_nsec = UTIME_OMIT;
1726         }
1727         if (v9iattr.valid & P9_ATTR_MTIME) {
1728             if (v9iattr.valid & P9_ATTR_MTIME_SET) {
1729                 times[1].tv_sec = v9iattr.mtime_sec;
1730                 times[1].tv_nsec = v9iattr.mtime_nsec;
1731             } else {
1732                 times[1].tv_nsec = UTIME_NOW;
1733             }
1734         } else {
1735             times[1].tv_nsec = UTIME_OMIT;
1736         }
1737         if (fid_has_valid_file_handle(pdu->s, fidp)) {
1738             err = v9fs_co_futimens(pdu, fidp, times);
1739         } else {
1740             err = v9fs_co_utimensat(pdu, &fidp->path, times);
1741         }
1742         if (err < 0) {
1743             goto out;
1744         }
1745     }
1746     /*
1747      * If the only valid entry in iattr is ctime we can call
1748      * chown(-1,-1) to update the ctime of the file
1749      */
1750     if ((v9iattr.valid & (P9_ATTR_UID | P9_ATTR_GID)) ||
1751         ((v9iattr.valid & P9_ATTR_CTIME)
1752          && !((v9iattr.valid & P9_ATTR_MASK) & ~P9_ATTR_CTIME))) {
1753         if (!(v9iattr.valid & P9_ATTR_UID)) {
1754             v9iattr.uid = -1;
1755         }
1756         if (!(v9iattr.valid & P9_ATTR_GID)) {
1757             v9iattr.gid = -1;
1758         }
1759         err = v9fs_co_chown(pdu, &fidp->path, v9iattr.uid,
1760                             v9iattr.gid);
1761         if (err < 0) {
1762             goto out;
1763         }
1764     }
1765     if (v9iattr.valid & (P9_ATTR_SIZE)) {
1766         if (fid_has_valid_file_handle(pdu->s, fidp)) {
1767             err = v9fs_co_ftruncate(pdu, fidp, v9iattr.size);
1768         } else {
1769             err = v9fs_co_truncate(pdu, &fidp->path, v9iattr.size);
1770         }
1771         if (err < 0) {
1772             goto out;
1773         }
1774     }
1775     err = offset;
1776     trace_v9fs_setattr_return(pdu->tag, pdu->id);
1777 out:
1778     put_fid(pdu, fidp);
1779 out_nofid:
1780     pdu_complete(pdu, err);
1781 }
1782 
1783 static int v9fs_walk_marshal(V9fsPDU *pdu, uint16_t nwnames, V9fsQID *qids)
1784 {
1785     int i;
1786     ssize_t err;
1787     size_t offset = 7;
1788 
1789     err = pdu_marshal(pdu, offset, "w", nwnames);
1790     if (err < 0) {
1791         return err;
1792     }
1793     offset += err;
1794     for (i = 0; i < nwnames; i++) {
1795         err = pdu_marshal(pdu, offset, "Q", &qids[i]);
1796         if (err < 0) {
1797             return err;
1798         }
1799         offset += err;
1800     }
1801     return offset;
1802 }
1803 
/*
 * A path element is illegal if it is the empty string or if it contains a
 * '/' character.
 */
static bool name_is_illegal(const char *name)
{
    if (name[0] == '\0') {
        return true;
    }
    return strchr(name, '/') != NULL;
}
1808 
/*
 * Two stat records identify the same file if both their device number and
 * their inode number match.
 */
static bool same_stat_id(const struct stat *a, const struct stat *b)
{
    if (a->st_dev != b->st_dev) {
        return false;
    }
    return a->st_ino == b->st_ino;
}
1813 
1814 /*
1815  * Returns a (newly allocated) comma-separated string presentation of the
1816  * passed array for logging (tracing) purpose for trace event "v9fs_walk".
1817  *
1818  * It is caller's responsibility to free the returned string.
1819  */
1820 static char *trace_v9fs_walk_wnames(V9fsString *wnames, size_t nwnames)
1821 {
1822     g_autofree char **arr = g_malloc0_n(nwnames + 1, sizeof(char *));
1823     for (size_t i = 0; i < nwnames; ++i) {
1824         arr[i] = wnames[i].data;
1825     }
1826     return g_strjoinv(", ", arr);
1827 }
1828 
1829 static void coroutine_fn v9fs_walk(void *opaque)
1830 {
1831     int name_idx, nwalked;
1832     g_autofree V9fsQID *qids = NULL;
1833     int i, err = 0, any_err = 0;
1834     V9fsPath dpath, path;
1835     P9ARRAY_REF(V9fsPath) pathes = NULL;
1836     uint16_t nwnames;
1837     struct stat stbuf, fidst;
1838     g_autofree struct stat *stbufs = NULL;
1839     size_t offset = 7;
1840     int32_t fid, newfid;
1841     P9ARRAY_REF(V9fsString) wnames = NULL;
1842     g_autofree char *trace_wnames = NULL;
1843     V9fsFidState *fidp;
1844     V9fsFidState *newfidp = NULL;
1845     V9fsPDU *pdu = opaque;
1846     V9fsState *s = pdu->s;
1847     V9fsQID qid;
1848 
1849     err = pdu_unmarshal(pdu, offset, "ddw", &fid, &newfid, &nwnames);
1850     if (err < 0) {
1851         pdu_complete(pdu, err);
1852         return;
1853     }
1854     offset += err;
1855 
1856     if (nwnames > P9_MAXWELEM) {
1857         err = -EINVAL;
1858         goto out_nofid_nownames;
1859     }
1860     if (nwnames) {
1861         P9ARRAY_NEW(V9fsString, wnames, nwnames);
1862         qids   = g_new0(V9fsQID, nwnames);
1863         stbufs = g_new0(struct stat, nwnames);
1864         P9ARRAY_NEW(V9fsPath, pathes, nwnames);
1865         for (i = 0; i < nwnames; i++) {
1866             err = pdu_unmarshal(pdu, offset, "s", &wnames[i]);
1867             if (err < 0) {
1868                 goto out_nofid_nownames;
1869             }
1870             if (name_is_illegal(wnames[i].data)) {
1871                 err = -ENOENT;
1872                 goto out_nofid_nownames;
1873             }
1874             offset += err;
1875         }
1876         if (trace_event_get_state_backends(TRACE_V9FS_WALK)) {
1877             trace_wnames = trace_v9fs_walk_wnames(wnames, nwnames);
1878             trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames,
1879                             trace_wnames);
1880         }
1881     } else {
1882         trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames, "");
1883     }
1884 
1885     fidp = get_fid(pdu, fid);
1886     if (fidp == NULL) {
1887         err = -ENOENT;
1888         goto out_nofid;
1889     }
1890 
1891     v9fs_path_init(&dpath);
1892     v9fs_path_init(&path);
1893     /*
1894      * Both dpath and path initially point to fidp.
1895      * Needed to handle request with nwnames == 0
1896      */
1897     v9fs_path_copy(&dpath, &fidp->path);
1898     v9fs_path_copy(&path, &fidp->path);
1899 
1900     /*
1901      * To keep latency (i.e. overall execution time for processing this
1902      * Twalk client request) as small as possible, run all the required fs
1903      * driver code altogether inside the following block.
1904      */
1905     v9fs_co_run_in_worker({
1906         nwalked = 0;
1907         if (v9fs_request_cancelled(pdu)) {
1908             any_err |= err = -EINTR;
1909             break;
1910         }
1911         err = s->ops->lstat(&s->ctx, &dpath, &fidst);
1912         if (err < 0) {
1913             any_err |= err = -errno;
1914             break;
1915         }
1916         stbuf = fidst;
1917         for (; nwalked < nwnames; nwalked++) {
1918             if (v9fs_request_cancelled(pdu)) {
1919                 any_err |= err = -EINTR;
1920                 break;
1921             }
1922             if (!same_stat_id(&pdu->s->root_st, &stbuf) ||
1923                 strcmp("..", wnames[nwalked].data))
1924             {
1925                 err = s->ops->name_to_path(&s->ctx, &dpath,
1926                                            wnames[nwalked].data,
1927                                            &pathes[nwalked]);
1928                 if (err < 0) {
1929                     any_err |= err = -errno;
1930                     break;
1931                 }
1932                 if (v9fs_request_cancelled(pdu)) {
1933                     any_err |= err = -EINTR;
1934                     break;
1935                 }
1936                 err = s->ops->lstat(&s->ctx, &pathes[nwalked], &stbuf);
1937                 if (err < 0) {
1938                     any_err |= err = -errno;
1939                     break;
1940                 }
1941                 stbufs[nwalked] = stbuf;
1942                 v9fs_path_copy(&dpath, &pathes[nwalked]);
1943             }
1944         }
1945     });
1946     /*
1947      * Handle all the rest of this Twalk request on main thread ...
1948      *
1949      * NOTE: -EINTR is an exception where we deviate from the protocol spec
1950      * and simply send a (R)Lerror response instead of bothering to assemble
1951      * a (deducted) Rwalk response; because -EINTR is always the result of a
1952      * Tflush request, so client would no longer wait for a response in this
1953      * case anyway.
1954      */
1955     if ((err < 0 && !nwalked) || err == -EINTR) {
1956         goto out;
1957     }
1958 
1959     any_err |= err = stat_to_qid(pdu, &fidst, &qid);
1960     if (err < 0 && !nwalked) {
1961         goto out;
1962     }
1963     stbuf = fidst;
1964 
1965     /* reset dpath and path */
1966     v9fs_path_copy(&dpath, &fidp->path);
1967     v9fs_path_copy(&path, &fidp->path);
1968 
1969     for (name_idx = 0; name_idx < nwalked; name_idx++) {
1970         if (!same_stat_id(&pdu->s->root_st, &stbuf) ||
1971             strcmp("..", wnames[name_idx].data))
1972         {
1973             stbuf = stbufs[name_idx];
1974             any_err |= err = stat_to_qid(pdu, &stbuf, &qid);
1975             if (err < 0) {
1976                 break;
1977             }
1978             v9fs_path_copy(&path, &pathes[name_idx]);
1979             v9fs_path_copy(&dpath, &path);
1980         }
1981         memcpy(&qids[name_idx], &qid, sizeof(qid));
1982     }
1983     if (any_err < 0) {
1984         if (!name_idx) {
1985             /* don't send any QIDs, send Rlerror instead */
1986             goto out;
1987         } else {
1988             /* send QIDs (not Rlerror), but fid MUST remain unaffected */
1989             goto send_qids;
1990         }
1991     }
1992     if (fid == newfid) {
1993         if (fidp->fid_type != P9_FID_NONE) {
1994             err = -EINVAL;
1995             goto out;
1996         }
1997         v9fs_path_write_lock(s);
1998         v9fs_path_copy(&fidp->path, &path);
1999         v9fs_path_unlock(s);
2000     } else {
2001         newfidp = alloc_fid(s, newfid);
2002         if (newfidp == NULL) {
2003             err = -EINVAL;
2004             goto out;
2005         }
2006         newfidp->uid = fidp->uid;
2007         v9fs_path_copy(&newfidp->path, &path);
2008     }
2009 send_qids:
2010     err = v9fs_walk_marshal(pdu, name_idx, qids);
2011     trace_v9fs_walk_return(pdu->tag, pdu->id, name_idx, qids);
2012 out:
2013     put_fid(pdu, fidp);
2014     if (newfidp) {
2015         put_fid(pdu, newfidp);
2016     }
2017     v9fs_path_free(&dpath);
2018     v9fs_path_free(&path);
2019     goto out_pdu_complete;
2020 out_nofid_nownames:
2021     trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames, "<?>");
2022 out_nofid:
2023 out_pdu_complete:
2024     pdu_complete(pdu, err);
2025 }
2026 
2027 static int32_t coroutine_fn get_iounit(V9fsPDU *pdu, V9fsPath *path)
2028 {
2029     struct statfs stbuf;
2030     int err = v9fs_co_statfs(pdu, path, &stbuf);
2031 
2032     return blksize_to_iounit(pdu, (err >= 0) ? stbuf.f_bsize : 0);
2033 }
2034 
2035 static void coroutine_fn v9fs_open(void *opaque)
2036 {
2037     int flags;
2038     int32_t fid;
2039     int32_t mode;
2040     V9fsQID qid;
2041     int iounit = 0;
2042     ssize_t err = 0;
2043     size_t offset = 7;
2044     struct stat stbuf;
2045     V9fsFidState *fidp;
2046     V9fsPDU *pdu = opaque;
2047     V9fsState *s = pdu->s;
2048     g_autofree char *trace_oflags = NULL;
2049 
2050     if (s->proto_version == V9FS_PROTO_2000L) {
2051         err = pdu_unmarshal(pdu, offset, "dd", &fid, &mode);
2052     } else {
2053         uint8_t modebyte;
2054         err = pdu_unmarshal(pdu, offset, "db", &fid, &modebyte);
2055         mode = modebyte;
2056     }
2057     if (err < 0) {
2058         goto out_nofid;
2059     }
2060     if (trace_event_get_state_backends(TRACE_V9FS_OPEN)) {
2061         trace_oflags = qemu_open_flags_tostr(
2062             (s->proto_version == V9FS_PROTO_2000L) ?
2063                 dotl_to_open_flags(mode) : omode_to_uflags(mode)
2064         );
2065         trace_v9fs_open(pdu->tag, pdu->id, fid, mode, trace_oflags);
2066     }
2067 
2068     fidp = get_fid(pdu, fid);
2069     if (fidp == NULL) {
2070         err = -ENOENT;
2071         goto out_nofid;
2072     }
2073     if (fidp->fid_type != P9_FID_NONE) {
2074         err = -EINVAL;
2075         goto out;
2076     }
2077 
2078     err = v9fs_co_lstat(pdu, &fidp->path, &stbuf);
2079     if (err < 0) {
2080         goto out;
2081     }
2082     err = stat_to_qid(pdu, &stbuf, &qid);
2083     if (err < 0) {
2084         goto out;
2085     }
2086     if (S_ISDIR(stbuf.st_mode)) {
2087         err = v9fs_co_opendir(pdu, fidp);
2088         if (err < 0) {
2089             goto out;
2090         }
2091         fidp->fid_type = P9_FID_DIR;
2092         err = pdu_marshal(pdu, offset, "Qd", &qid, 0);
2093         if (err < 0) {
2094             goto out;
2095         }
2096         err += offset;
2097     } else {
2098         if (s->proto_version == V9FS_PROTO_2000L) {
2099             flags = get_dotl_openflags(s, mode);
2100         } else {
2101             flags = omode_to_uflags(mode);
2102         }
2103         if (is_ro_export(&s->ctx)) {
2104             if (mode & O_WRONLY || mode & O_RDWR ||
2105                 mode & O_APPEND || mode & O_TRUNC) {
2106                 err = -EROFS;
2107                 goto out;
2108             }
2109         }
2110         err = v9fs_co_open(pdu, fidp, flags);
2111         if (err < 0) {
2112             goto out;
2113         }
2114         fidp->fid_type = P9_FID_FILE;
2115         fidp->open_flags = flags;
2116         if (flags & O_EXCL) {
2117             /*
2118              * We let the host file system do O_EXCL check
2119              * We should not reclaim such fd
2120              */
2121             fidp->flags |= FID_NON_RECLAIMABLE;
2122         }
2123         iounit = get_iounit(pdu, &fidp->path);
2124         err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
2125         if (err < 0) {
2126             goto out;
2127         }
2128         err += offset;
2129     }
2130     trace_v9fs_open_return(pdu->tag, pdu->id,
2131                            qid.type, qid.version, qid.path, iounit);
2132 out:
2133     put_fid(pdu, fidp);
2134 out_nofid:
2135     pdu_complete(pdu, err);
2136 }
2137 
2138 static void coroutine_fn v9fs_lcreate(void *opaque)
2139 {
2140     int32_t dfid, flags, mode;
2141     gid_t gid;
2142     ssize_t err = 0;
2143     ssize_t offset = 7;
2144     V9fsString name;
2145     V9fsFidState *fidp;
2146     struct stat stbuf;
2147     V9fsQID qid;
2148     int32_t iounit;
2149     V9fsPDU *pdu = opaque;
2150 
2151     v9fs_string_init(&name);
2152     err = pdu_unmarshal(pdu, offset, "dsddd", &dfid,
2153                         &name, &flags, &mode, &gid);
2154     if (err < 0) {
2155         goto out_nofid;
2156     }
2157     trace_v9fs_lcreate(pdu->tag, pdu->id, dfid, flags, mode, gid);
2158 
2159     if (name_is_illegal(name.data)) {
2160         err = -ENOENT;
2161         goto out_nofid;
2162     }
2163 
2164     if (!strcmp(".", name.data) || !strcmp("..", name.data)) {
2165         err = -EEXIST;
2166         goto out_nofid;
2167     }
2168 
2169     fidp = get_fid(pdu, dfid);
2170     if (fidp == NULL) {
2171         err = -ENOENT;
2172         goto out_nofid;
2173     }
2174     if (fidp->fid_type != P9_FID_NONE) {
2175         err = -EINVAL;
2176         goto out;
2177     }
2178 
2179     flags = get_dotl_openflags(pdu->s, flags);
2180     err = v9fs_co_open2(pdu, fidp, &name, gid,
2181                         flags | O_CREAT, mode, &stbuf);
2182     if (err < 0) {
2183         goto out;
2184     }
2185     fidp->fid_type = P9_FID_FILE;
2186     fidp->open_flags = flags;
2187     if (flags & O_EXCL) {
2188         /*
2189          * We let the host file system do O_EXCL check
2190          * We should not reclaim such fd
2191          */
2192         fidp->flags |= FID_NON_RECLAIMABLE;
2193     }
2194     iounit =  get_iounit(pdu, &fidp->path);
2195     err = stat_to_qid(pdu, &stbuf, &qid);
2196     if (err < 0) {
2197         goto out;
2198     }
2199     err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
2200     if (err < 0) {
2201         goto out;
2202     }
2203     err += offset;
2204     trace_v9fs_lcreate_return(pdu->tag, pdu->id,
2205                               qid.type, qid.version, qid.path, iounit);
2206 out:
2207     put_fid(pdu, fidp);
2208 out_nofid:
2209     pdu_complete(pdu, err);
2210     v9fs_string_free(&name);
2211 }
2212 
2213 static void coroutine_fn v9fs_fsync(void *opaque)
2214 {
2215     int err;
2216     int32_t fid;
2217     int datasync;
2218     size_t offset = 7;
2219     V9fsFidState *fidp;
2220     V9fsPDU *pdu = opaque;
2221 
2222     err = pdu_unmarshal(pdu, offset, "dd", &fid, &datasync);
2223     if (err < 0) {
2224         goto out_nofid;
2225     }
2226     trace_v9fs_fsync(pdu->tag, pdu->id, fid, datasync);
2227 
2228     fidp = get_fid(pdu, fid);
2229     if (fidp == NULL) {
2230         err = -ENOENT;
2231         goto out_nofid;
2232     }
2233     err = v9fs_co_fsync(pdu, fidp, datasync);
2234     if (!err) {
2235         err = offset;
2236     }
2237     put_fid(pdu, fidp);
2238 out_nofid:
2239     pdu_complete(pdu, err);
2240 }
2241 
2242 static void coroutine_fn v9fs_clunk(void *opaque)
2243 {
2244     int err;
2245     int32_t fid;
2246     size_t offset = 7;
2247     V9fsFidState *fidp;
2248     V9fsPDU *pdu = opaque;
2249     V9fsState *s = pdu->s;
2250 
2251     err = pdu_unmarshal(pdu, offset, "d", &fid);
2252     if (err < 0) {
2253         goto out_nofid;
2254     }
2255     trace_v9fs_clunk(pdu->tag, pdu->id, fid);
2256 
2257     fidp = clunk_fid(s, fid);
2258     if (fidp == NULL) {
2259         err = -ENOENT;
2260         goto out_nofid;
2261     }
2262     /*
2263      * Bump the ref so that put_fid will
2264      * free the fid.
2265      */
2266     fidp->ref++;
2267     err = put_fid(pdu, fidp);
2268     if (!err) {
2269         err = offset;
2270     }
2271 out_nofid:
2272     pdu_complete(pdu, err);
2273 }
2274 
2275 /*
2276  * Create a QEMUIOVector for a sub-region of PDU iovecs
2277  *
2278  * @qiov:       uninitialized QEMUIOVector
2279  * @skip:       number of bytes to skip from beginning of PDU
2280  * @size:       number of bytes to include
2281  * @is_write:   true - write, false - read
2282  *
2283  * The resulting QEMUIOVector has heap-allocated iovecs and must be cleaned up
2284  * with qemu_iovec_destroy().
2285  */
2286 static void v9fs_init_qiov_from_pdu(QEMUIOVector *qiov, V9fsPDU *pdu,
2287                                     size_t skip, size_t size,
2288                                     bool is_write)
2289 {
2290     QEMUIOVector elem;
2291     struct iovec *iov;
2292     unsigned int niov;
2293 
2294     if (is_write) {
2295         pdu->s->transport->init_out_iov_from_pdu(pdu, &iov, &niov, size + skip);
2296     } else {
2297         pdu->s->transport->init_in_iov_from_pdu(pdu, &iov, &niov, size + skip);
2298     }
2299 
2300     qemu_iovec_init_external(&elem, iov, niov);
2301     qemu_iovec_init(qiov, niov);
2302     qemu_iovec_concat(qiov, &elem, skip, size);
2303 }
2304 
2305 static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
2306                            uint64_t off, uint32_t max_count)
2307 {
2308     ssize_t err;
2309     size_t offset = 7;
2310     uint64_t read_count;
2311     QEMUIOVector qiov_full;
2312 
2313     if (fidp->fs.xattr.len < off) {
2314         read_count = 0;
2315     } else {
2316         read_count = fidp->fs.xattr.len - off;
2317     }
2318     if (read_count > max_count) {
2319         read_count = max_count;
2320     }
2321     err = pdu_marshal(pdu, offset, "d", read_count);
2322     if (err < 0) {
2323         return err;
2324     }
2325     offset += err;
2326 
2327     v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, read_count, false);
2328     err = v9fs_pack(qiov_full.iov, qiov_full.niov, 0,
2329                     ((char *)fidp->fs.xattr.value) + off,
2330                     read_count);
2331     qemu_iovec_destroy(&qiov_full);
2332     if (err < 0) {
2333         return err;
2334     }
2335     offset += err;
2336     return offset;
2337 }
2338 
2339 static int coroutine_fn v9fs_do_readdir_with_stat(V9fsPDU *pdu,
2340                                                   V9fsFidState *fidp,
2341                                                   uint32_t max_count)
2342 {
2343     V9fsPath path;
2344     V9fsStat v9stat;
2345     int len, err = 0;
2346     int32_t count = 0;
2347     struct stat stbuf;
2348     off_t saved_dir_pos;
2349     struct dirent *dent;
2350 
2351     /* save the directory position */
2352     saved_dir_pos = v9fs_co_telldir(pdu, fidp);
2353     if (saved_dir_pos < 0) {
2354         return saved_dir_pos;
2355     }
2356 
2357     while (1) {
2358         v9fs_path_init(&path);
2359 
2360         v9fs_readdir_lock(&fidp->fs.dir);
2361 
2362         err = v9fs_co_readdir(pdu, fidp, &dent);
2363         if (err || !dent) {
2364             break;
2365         }
2366         err = v9fs_co_name_to_path(pdu, &fidp->path, dent->d_name, &path);
2367         if (err < 0) {
2368             break;
2369         }
2370         err = v9fs_co_lstat(pdu, &path, &stbuf);
2371         if (err < 0) {
2372             break;
2373         }
2374         err = stat_to_v9stat(pdu, &path, dent->d_name, &stbuf, &v9stat);
2375         if (err < 0) {
2376             break;
2377         }
2378         if ((count + v9stat.size + 2) > max_count) {
2379             v9fs_readdir_unlock(&fidp->fs.dir);
2380 
2381             /* Ran out of buffer. Set dir back to old position and return */
2382             v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
2383             v9fs_stat_free(&v9stat);
2384             v9fs_path_free(&path);
2385             return count;
2386         }
2387 
2388         /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */
2389         len = pdu_marshal(pdu, 11 + count, "S", &v9stat);
2390 
2391         v9fs_readdir_unlock(&fidp->fs.dir);
2392 
2393         if (len < 0) {
2394             v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
2395             v9fs_stat_free(&v9stat);
2396             v9fs_path_free(&path);
2397             return len;
2398         }
2399         count += len;
2400         v9fs_stat_free(&v9stat);
2401         v9fs_path_free(&path);
2402         saved_dir_pos = qemu_dirent_off(dent);
2403     }
2404 
2405     v9fs_readdir_unlock(&fidp->fs.dir);
2406 
2407     v9fs_path_free(&path);
2408     if (err < 0) {
2409         return err;
2410     }
2411     return count;
2412 }
2413 
2414 static void coroutine_fn v9fs_read(void *opaque)
2415 {
2416     int32_t fid;
2417     uint64_t off;
2418     ssize_t err = 0;
2419     int32_t count = 0;
2420     size_t offset = 7;
2421     uint32_t max_count;
2422     V9fsFidState *fidp;
2423     V9fsPDU *pdu = opaque;
2424     V9fsState *s = pdu->s;
2425 
2426     err = pdu_unmarshal(pdu, offset, "dqd", &fid, &off, &max_count);
2427     if (err < 0) {
2428         goto out_nofid;
2429     }
2430     trace_v9fs_read(pdu->tag, pdu->id, fid, off, max_count);
2431 
2432     fidp = get_fid(pdu, fid);
2433     if (fidp == NULL) {
2434         err = -EINVAL;
2435         goto out_nofid;
2436     }
2437     if (fidp->fid_type == P9_FID_DIR) {
2438         if (s->proto_version != V9FS_PROTO_2000U) {
2439             warn_report_once(
2440                 "9p: bad client: T_read request on directory only expected "
2441                 "with 9P2000.u protocol version"
2442             );
2443             err = -EOPNOTSUPP;
2444             goto out;
2445         }
2446         if (off == 0) {
2447             v9fs_co_rewinddir(pdu, fidp);
2448         }
2449         count = v9fs_do_readdir_with_stat(pdu, fidp, max_count);
2450         if (count < 0) {
2451             err = count;
2452             goto out;
2453         }
2454         err = pdu_marshal(pdu, offset, "d", count);
2455         if (err < 0) {
2456             goto out;
2457         }
2458         err += offset + count;
2459     } else if (fidp->fid_type == P9_FID_FILE) {
2460         QEMUIOVector qiov_full;
2461         QEMUIOVector qiov;
2462         int32_t len;
2463 
2464         v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset + 4, max_count, false);
2465         qemu_iovec_init(&qiov, qiov_full.niov);
2466         do {
2467             qemu_iovec_reset(&qiov);
2468             qemu_iovec_concat(&qiov, &qiov_full, count, qiov_full.size - count);
2469             if (0) {
2470                 print_sg(qiov.iov, qiov.niov);
2471             }
2472             /* Loop in case of EINTR */
2473             do {
2474                 len = v9fs_co_preadv(pdu, fidp, qiov.iov, qiov.niov, off);
2475                 if (len >= 0) {
2476                     off   += len;
2477                     count += len;
2478                 }
2479             } while (len == -EINTR && !pdu->cancelled);
2480             if (len < 0) {
2481                 /* IO error return the error */
2482                 err = len;
2483                 goto out_free_iovec;
2484             }
2485         } while (count < max_count && len > 0);
2486         err = pdu_marshal(pdu, offset, "d", count);
2487         if (err < 0) {
2488             goto out_free_iovec;
2489         }
2490         err += offset + count;
2491 out_free_iovec:
2492         qemu_iovec_destroy(&qiov);
2493         qemu_iovec_destroy(&qiov_full);
2494     } else if (fidp->fid_type == P9_FID_XATTR) {
2495         err = v9fs_xattr_read(s, pdu, fidp, off, max_count);
2496     } else {
2497         err = -EINVAL;
2498     }
2499     trace_v9fs_read_return(pdu->tag, pdu->id, count, err);
2500 out:
2501     put_fid(pdu, fidp);
2502 out_nofid:
2503     pdu_complete(pdu, err);
2504 }
2505 
2506 /**
2507  * v9fs_readdir_response_size() - Returns size required in Rreaddir response
2508  * for the passed dirent @name.
2509  *
2510  * @name: directory entry's name (i.e. file name, directory name)
2511  * Return: required size in bytes
2512  */
2513 size_t v9fs_readdir_response_size(V9fsString *name)
2514 {
2515     /*
2516      * Size of each dirent on the wire: size of qid (13) + size of offset (8)
2517      * size of type (1) + size of name.size (2) + strlen(name.data)
2518      */
2519     return 24 + v9fs_string_size(name);
2520 }
2521 
2522 static void v9fs_free_dirents(struct V9fsDirEnt *e)
2523 {
2524     struct V9fsDirEnt *next = NULL;
2525 
2526     for (; e; e = next) {
2527         next = e->next;
2528         g_free(e->dent);
2529         g_free(e->st);
2530         g_free(e);
2531     }
2532 }
2533 
2534 static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
2535                                         off_t offset, int32_t max_count)
2536 {
2537     size_t size;
2538     V9fsQID qid;
2539     V9fsString name;
2540     int len, err = 0;
2541     int32_t count = 0;
2542     off_t off;
2543     struct dirent *dent;
2544     struct stat *st;
2545     struct V9fsDirEnt *entries = NULL;
2546 
2547     /*
2548      * inode remapping requires the device id, which in turn might be
2549      * different for different directory entries, so if inode remapping is
2550      * enabled we have to make a full stat for each directory entry
2551      */
2552     const bool dostat = pdu->s->ctx.export_flags & V9FS_REMAP_INODES;
2553 
2554     /*
2555      * Fetch all required directory entries altogether on a background IO
2556      * thread from fs driver. We don't want to do that for each entry
2557      * individually, because hopping between threads (this main IO thread
2558      * and background IO driver thread) would sum up to huge latencies.
2559      */
2560     count = v9fs_co_readdir_many(pdu, fidp, &entries, offset, max_count,
2561                                  dostat);
2562     if (count < 0) {
2563         err = count;
2564         count = 0;
2565         goto out;
2566     }
2567     count = 0;
2568 
2569     for (struct V9fsDirEnt *e = entries; e; e = e->next) {
2570         dent = e->dent;
2571 
2572         if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
2573             st = e->st;
2574             /* e->st should never be NULL, but just to be sure */
2575             if (!st) {
2576                 err = -1;
2577                 break;
2578             }
2579 
2580             /* remap inode */
2581             err = stat_to_qid(pdu, st, &qid);
2582             if (err < 0) {
2583                 break;
2584             }
2585         } else {
2586             /*
2587              * Fill up just the path field of qid because the client uses
2588              * only that. To fill the entire qid structure we will have
2589              * to stat each dirent found, which is expensive. For the
2590              * latter reason we don't call stat_to_qid() here. Only drawback
2591              * is that no multi-device export detection of stat_to_qid()
2592              * would be done and provided as error to the user here. But
2593              * user would get that error anyway when accessing those
2594              * files/dirs through other ways.
2595              */
2596             size = MIN(sizeof(dent->d_ino), sizeof(qid.path));
2597             memcpy(&qid.path, &dent->d_ino, size);
2598             /* Fill the other fields with dummy values */
2599             qid.type = 0;
2600             qid.version = 0;
2601         }
2602 
2603         off = qemu_dirent_off(dent);
2604         v9fs_string_init(&name);
2605         v9fs_string_sprintf(&name, "%s", dent->d_name);
2606 
2607         /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */
2608         len = pdu_marshal(pdu, 11 + count, "Qqbs",
2609                           &qid, off,
2610                           dent->d_type, &name);
2611 
2612         v9fs_string_free(&name);
2613 
2614         if (len < 0) {
2615             err = len;
2616             break;
2617         }
2618 
2619         count += len;
2620     }
2621 
2622 out:
2623     v9fs_free_dirents(entries);
2624     if (err < 0) {
2625         return err;
2626     }
2627     return count;
2628 }
2629 
2630 static void coroutine_fn v9fs_readdir(void *opaque)
2631 {
2632     int32_t fid;
2633     V9fsFidState *fidp;
2634     ssize_t retval = 0;
2635     size_t offset = 7;
2636     uint64_t initial_offset;
2637     int32_t count;
2638     uint32_t max_count;
2639     V9fsPDU *pdu = opaque;
2640     V9fsState *s = pdu->s;
2641 
2642     retval = pdu_unmarshal(pdu, offset, "dqd", &fid,
2643                            &initial_offset, &max_count);
2644     if (retval < 0) {
2645         goto out_nofid;
2646     }
2647     trace_v9fs_readdir(pdu->tag, pdu->id, fid, initial_offset, max_count);
2648 
2649     /* Enough space for a R_readdir header: size[4] Rreaddir tag[2] count[4] */
2650     if (max_count > s->msize - 11) {
2651         max_count = s->msize - 11;
2652         warn_report_once(
2653             "9p: bad client: T_readdir with count > msize - 11"
2654         );
2655     }
2656 
2657     fidp = get_fid(pdu, fid);
2658     if (fidp == NULL) {
2659         retval = -EINVAL;
2660         goto out_nofid;
2661     }
2662     if (fidp->fid_type != P9_FID_DIR) {
2663         warn_report_once("9p: bad client: T_readdir on non-directory stream");
2664         retval = -ENOTDIR;
2665         goto out;
2666     }
2667     if (!fidp->fs.dir.stream) {
2668         retval = -EINVAL;
2669         goto out;
2670     }
2671     if (s->proto_version != V9FS_PROTO_2000L) {
2672         warn_report_once(
2673             "9p: bad client: T_readdir request only expected with 9P2000.L "
2674             "protocol version"
2675         );
2676         retval = -EOPNOTSUPP;
2677         goto out;
2678     }
2679     count = v9fs_do_readdir(pdu, fidp, (off_t) initial_offset, max_count);
2680     if (count < 0) {
2681         retval = count;
2682         goto out;
2683     }
2684     retval = pdu_marshal(pdu, offset, "d", count);
2685     if (retval < 0) {
2686         goto out;
2687     }
2688     retval += count + offset;
2689     trace_v9fs_readdir_return(pdu->tag, pdu->id, count, retval);
2690 out:
2691     put_fid(pdu, fidp);
2692 out_nofid:
2693     pdu_complete(pdu, retval);
2694 }
2695 
2696 static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
2697                             uint64_t off, uint32_t count,
2698                             struct iovec *sg, int cnt)
2699 {
2700     int i, to_copy;
2701     ssize_t err = 0;
2702     uint64_t write_count;
2703     size_t offset = 7;
2704 
2705 
2706     if (fidp->fs.xattr.len < off) {
2707         return -ENOSPC;
2708     }
2709     write_count = fidp->fs.xattr.len - off;
2710     if (write_count > count) {
2711         write_count = count;
2712     }
2713     err = pdu_marshal(pdu, offset, "d", write_count);
2714     if (err < 0) {
2715         return err;
2716     }
2717     err += offset;
2718     fidp->fs.xattr.copied_len += write_count;
2719     /*
2720      * Now copy the content from sg list
2721      */
2722     for (i = 0; i < cnt; i++) {
2723         if (write_count > sg[i].iov_len) {
2724             to_copy = sg[i].iov_len;
2725         } else {
2726             to_copy = write_count;
2727         }
2728         memcpy((char *)fidp->fs.xattr.value + off, sg[i].iov_base, to_copy);
2729         /* updating vs->off since we are not using below */
2730         off += to_copy;
2731         write_count -= to_copy;
2732     }
2733 
2734     return err;
2735 }
2736 
2737 static void coroutine_fn v9fs_write(void *opaque)
2738 {
2739     ssize_t err;
2740     int32_t fid;
2741     uint64_t off;
2742     uint32_t count;
2743     int32_t len = 0;
2744     int32_t total = 0;
2745     size_t offset = 7;
2746     V9fsFidState *fidp;
2747     V9fsPDU *pdu = opaque;
2748     V9fsState *s = pdu->s;
2749     QEMUIOVector qiov_full;
2750     QEMUIOVector qiov;
2751 
2752     err = pdu_unmarshal(pdu, offset, "dqd", &fid, &off, &count);
2753     if (err < 0) {
2754         pdu_complete(pdu, err);
2755         return;
2756     }
2757     offset += err;
2758     v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, count, true);
2759     trace_v9fs_write(pdu->tag, pdu->id, fid, off, count, qiov_full.niov);
2760 
2761     fidp = get_fid(pdu, fid);
2762     if (fidp == NULL) {
2763         err = -EINVAL;
2764         goto out_nofid;
2765     }
2766     if (fidp->fid_type == P9_FID_FILE) {
2767         if (fidp->fs.fd == -1) {
2768             err = -EINVAL;
2769             goto out;
2770         }
2771     } else if (fidp->fid_type == P9_FID_XATTR) {
2772         /*
2773          * setxattr operation
2774          */
2775         err = v9fs_xattr_write(s, pdu, fidp, off, count,
2776                                qiov_full.iov, qiov_full.niov);
2777         goto out;
2778     } else {
2779         err = -EINVAL;
2780         goto out;
2781     }
2782     qemu_iovec_init(&qiov, qiov_full.niov);
2783     do {
2784         qemu_iovec_reset(&qiov);
2785         qemu_iovec_concat(&qiov, &qiov_full, total, qiov_full.size - total);
2786         if (0) {
2787             print_sg(qiov.iov, qiov.niov);
2788         }
2789         /* Loop in case of EINTR */
2790         do {
2791             len = v9fs_co_pwritev(pdu, fidp, qiov.iov, qiov.niov, off);
2792             if (len >= 0) {
2793                 off   += len;
2794                 total += len;
2795             }
2796         } while (len == -EINTR && !pdu->cancelled);
2797         if (len < 0) {
2798             /* IO error return the error */
2799             err = len;
2800             goto out_qiov;
2801         }
2802     } while (total < count && len > 0);
2803 
2804     offset = 7;
2805     err = pdu_marshal(pdu, offset, "d", total);
2806     if (err < 0) {
2807         goto out_qiov;
2808     }
2809     err += offset;
2810     trace_v9fs_write_return(pdu->tag, pdu->id, total, err);
2811 out_qiov:
2812     qemu_iovec_destroy(&qiov);
2813 out:
2814     put_fid(pdu, fidp);
2815 out_nofid:
2816     qemu_iovec_destroy(&qiov_full);
2817     pdu_complete(pdu, err);
2818 }
2819 
2820 static void coroutine_fn v9fs_create(void *opaque)
2821 {
2822     int32_t fid;
2823     int err = 0;
2824     size_t offset = 7;
2825     V9fsFidState *fidp;
2826     V9fsQID qid;
2827     int32_t perm;
2828     int8_t mode;
2829     V9fsPath path;
2830     struct stat stbuf;
2831     V9fsString name;
2832     V9fsString extension;
2833     int iounit;
2834     V9fsPDU *pdu = opaque;
2835     V9fsState *s = pdu->s;
2836 
2837     v9fs_path_init(&path);
2838     v9fs_string_init(&name);
2839     v9fs_string_init(&extension);
2840     err = pdu_unmarshal(pdu, offset, "dsdbs", &fid, &name,
2841                         &perm, &mode, &extension);
2842     if (err < 0) {
2843         goto out_nofid;
2844     }
2845     trace_v9fs_create(pdu->tag, pdu->id, fid, name.data, perm, mode);
2846 
2847     if (name_is_illegal(name.data)) {
2848         err = -ENOENT;
2849         goto out_nofid;
2850     }
2851 
2852     if (!strcmp(".", name.data) || !strcmp("..", name.data)) {
2853         err = -EEXIST;
2854         goto out_nofid;
2855     }
2856 
2857     fidp = get_fid(pdu, fid);
2858     if (fidp == NULL) {
2859         err = -EINVAL;
2860         goto out_nofid;
2861     }
2862     if (fidp->fid_type != P9_FID_NONE) {
2863         err = -EINVAL;
2864         goto out;
2865     }
2866     if (perm & P9_STAT_MODE_DIR) {
2867         err = v9fs_co_mkdir(pdu, fidp, &name, perm & 0777,
2868                             fidp->uid, -1, &stbuf);
2869         if (err < 0) {
2870             goto out;
2871         }
2872         err = v9fs_co_name_to_path(pdu, &fidp->path, name.data, &path);
2873         if (err < 0) {
2874             goto out;
2875         }
2876         v9fs_path_write_lock(s);
2877         v9fs_path_copy(&fidp->path, &path);
2878         v9fs_path_unlock(s);
2879         err = v9fs_co_opendir(pdu, fidp);
2880         if (err < 0) {
2881             goto out;
2882         }
2883         fidp->fid_type = P9_FID_DIR;
2884     } else if (perm & P9_STAT_MODE_SYMLINK) {
2885         err = v9fs_co_symlink(pdu, fidp, &name,
2886                               extension.data, -1 , &stbuf);
2887         if (err < 0) {
2888             goto out;
2889         }
2890         err = v9fs_co_name_to_path(pdu, &fidp->path, name.data, &path);
2891         if (err < 0) {
2892             goto out;
2893         }
2894         v9fs_path_write_lock(s);
2895         v9fs_path_copy(&fidp->path, &path);
2896         v9fs_path_unlock(s);
2897     } else if (perm & P9_STAT_MODE_LINK) {
2898         int32_t ofid = atoi(extension.data);
2899         V9fsFidState *ofidp = get_fid(pdu, ofid);
2900         if (ofidp == NULL) {
2901             err = -EINVAL;
2902             goto out;
2903         }
2904         err = v9fs_co_link(pdu, ofidp, fidp, &name);
2905         put_fid(pdu, ofidp);
2906         if (err < 0) {
2907             goto out;
2908         }
2909         err = v9fs_co_name_to_path(pdu, &fidp->path, name.data, &path);
2910         if (err < 0) {
2911             fidp->fid_type = P9_FID_NONE;
2912             goto out;
2913         }
2914         v9fs_path_write_lock(s);
2915         v9fs_path_copy(&fidp->path, &path);
2916         v9fs_path_unlock(s);
2917         err = v9fs_co_lstat(pdu, &fidp->path, &stbuf);
2918         if (err < 0) {
2919             fidp->fid_type = P9_FID_NONE;
2920             goto out;
2921         }
2922     } else if (perm & P9_STAT_MODE_DEVICE) {
2923         char ctype;
2924         uint32_t major, minor;
2925         mode_t nmode = 0;
2926 
2927         if (sscanf(extension.data, "%c %u %u", &ctype, &major, &minor) != 3) {
2928             err = -errno;
2929             goto out;
2930         }
2931 
2932         switch (ctype) {
2933         case 'c':
2934             nmode = S_IFCHR;
2935             break;
2936         case 'b':
2937             nmode = S_IFBLK;
2938             break;
2939         default:
2940             err = -EIO;
2941             goto out;
2942         }
2943 
2944         nmode |= perm & 0777;
2945         err = v9fs_co_mknod(pdu, fidp, &name, fidp->uid, -1,
2946                             makedev(major, minor), nmode, &stbuf);
2947         if (err < 0) {
2948             goto out;
2949         }
2950         err = v9fs_co_name_to_path(pdu, &fidp->path, name.data, &path);
2951         if (err < 0) {
2952             goto out;
2953         }
2954         v9fs_path_write_lock(s);
2955         v9fs_path_copy(&fidp->path, &path);
2956         v9fs_path_unlock(s);
2957     } else if (perm & P9_STAT_MODE_NAMED_PIPE) {
2958         err = v9fs_co_mknod(pdu, fidp, &name, fidp->uid, -1,
2959                             0, S_IFIFO | (perm & 0777), &stbuf);
2960         if (err < 0) {
2961             goto out;
2962         }
2963         err = v9fs_co_name_to_path(pdu, &fidp->path, name.data, &path);
2964         if (err < 0) {
2965             goto out;
2966         }
2967         v9fs_path_write_lock(s);
2968         v9fs_path_copy(&fidp->path, &path);
2969         v9fs_path_unlock(s);
2970     } else if (perm & P9_STAT_MODE_SOCKET) {
2971         err = v9fs_co_mknod(pdu, fidp, &name, fidp->uid, -1,
2972                             0, S_IFSOCK | (perm & 0777), &stbuf);
2973         if (err < 0) {
2974             goto out;
2975         }
2976         err = v9fs_co_name_to_path(pdu, &fidp->path, name.data, &path);
2977         if (err < 0) {
2978             goto out;
2979         }
2980         v9fs_path_write_lock(s);
2981         v9fs_path_copy(&fidp->path, &path);
2982         v9fs_path_unlock(s);
2983     } else {
2984         err = v9fs_co_open2(pdu, fidp, &name, -1,
2985                             omode_to_uflags(mode) | O_CREAT, perm, &stbuf);
2986         if (err < 0) {
2987             goto out;
2988         }
2989         fidp->fid_type = P9_FID_FILE;
2990         fidp->open_flags = omode_to_uflags(mode);
2991         if (fidp->open_flags & O_EXCL) {
2992             /*
2993              * We let the host file system do O_EXCL check
2994              * We should not reclaim such fd
2995              */
2996             fidp->flags |= FID_NON_RECLAIMABLE;
2997         }
2998     }
2999     iounit = get_iounit(pdu, &fidp->path);
3000     err = stat_to_qid(pdu, &stbuf, &qid);
3001     if (err < 0) {
3002         goto out;
3003     }
3004     err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
3005     if (err < 0) {
3006         goto out;
3007     }
3008     err += offset;
3009     trace_v9fs_create_return(pdu->tag, pdu->id,
3010                              qid.type, qid.version, qid.path, iounit);
3011 out:
3012     put_fid(pdu, fidp);
3013 out_nofid:
3014    pdu_complete(pdu, err);
3015    v9fs_string_free(&name);
3016    v9fs_string_free(&extension);
3017    v9fs_path_free(&path);
3018 }
3019 
3020 static void coroutine_fn v9fs_symlink(void *opaque)
3021 {
3022     V9fsPDU *pdu = opaque;
3023     V9fsString name;
3024     V9fsString symname;
3025     V9fsFidState *dfidp;
3026     V9fsQID qid;
3027     struct stat stbuf;
3028     int32_t dfid;
3029     int err = 0;
3030     gid_t gid;
3031     size_t offset = 7;
3032 
3033     v9fs_string_init(&name);
3034     v9fs_string_init(&symname);
3035     err = pdu_unmarshal(pdu, offset, "dssd", &dfid, &name, &symname, &gid);
3036     if (err < 0) {
3037         goto out_nofid;
3038     }
3039     trace_v9fs_symlink(pdu->tag, pdu->id, dfid, name.data, symname.data, gid);
3040 
3041     if (name_is_illegal(name.data)) {
3042         err = -ENOENT;
3043         goto out_nofid;
3044     }
3045 
3046     if (!strcmp(".", name.data) || !strcmp("..", name.data)) {
3047         err = -EEXIST;
3048         goto out_nofid;
3049     }
3050 
3051     dfidp = get_fid(pdu, dfid);
3052     if (dfidp == NULL) {
3053         err = -EINVAL;
3054         goto out_nofid;
3055     }
3056     err = v9fs_co_symlink(pdu, dfidp, &name, symname.data, gid, &stbuf);
3057     if (err < 0) {
3058         goto out;
3059     }
3060     err = stat_to_qid(pdu, &stbuf, &qid);
3061     if (err < 0) {
3062         goto out;
3063     }
3064     err =  pdu_marshal(pdu, offset, "Q", &qid);
3065     if (err < 0) {
3066         goto out;
3067     }
3068     err += offset;
3069     trace_v9fs_symlink_return(pdu->tag, pdu->id,
3070                               qid.type, qid.version, qid.path);
3071 out:
3072     put_fid(pdu, dfidp);
3073 out_nofid:
3074     pdu_complete(pdu, err);
3075     v9fs_string_free(&name);
3076     v9fs_string_free(&symname);
3077 }
3078 
3079 static void coroutine_fn v9fs_flush(void *opaque)
3080 {
3081     ssize_t err;
3082     int16_t tag;
3083     size_t offset = 7;
3084     V9fsPDU *cancel_pdu = NULL;
3085     V9fsPDU *pdu = opaque;
3086     V9fsState *s = pdu->s;
3087 
3088     err = pdu_unmarshal(pdu, offset, "w", &tag);
3089     if (err < 0) {
3090         pdu_complete(pdu, err);
3091         return;
3092     }
3093     trace_v9fs_flush(pdu->tag, pdu->id, tag);
3094 
3095     if (pdu->tag == tag) {
3096         warn_report("the guest sent a self-referencing 9P flush request");
3097     } else {
3098         QLIST_FOREACH(cancel_pdu, &s->active_list, next) {
3099             if (cancel_pdu->tag == tag) {
3100                 break;
3101             }
3102         }
3103     }
3104     if (cancel_pdu) {
3105         cancel_pdu->cancelled = 1;
3106         /*
3107          * Wait for pdu to complete.
3108          */
3109         qemu_co_queue_wait(&cancel_pdu->complete, NULL);
3110         if (!qemu_co_queue_next(&cancel_pdu->complete)) {
3111             cancel_pdu->cancelled = 0;
3112             pdu_free(cancel_pdu);
3113         }
3114     }
3115     pdu_complete(pdu, 7);
3116 }
3117 
3118 static void coroutine_fn v9fs_link(void *opaque)
3119 {
3120     V9fsPDU *pdu = opaque;
3121     int32_t dfid, oldfid;
3122     V9fsFidState *dfidp, *oldfidp;
3123     V9fsString name;
3124     size_t offset = 7;
3125     int err = 0;
3126 
3127     v9fs_string_init(&name);
3128     err = pdu_unmarshal(pdu, offset, "dds", &dfid, &oldfid, &name);
3129     if (err < 0) {
3130         goto out_nofid;
3131     }
3132     trace_v9fs_link(pdu->tag, pdu->id, dfid, oldfid, name.data);
3133 
3134     if (name_is_illegal(name.data)) {
3135         err = -ENOENT;
3136         goto out_nofid;
3137     }
3138 
3139     if (!strcmp(".", name.data) || !strcmp("..", name.data)) {
3140         err = -EEXIST;
3141         goto out_nofid;
3142     }
3143 
3144     dfidp = get_fid(pdu, dfid);
3145     if (dfidp == NULL) {
3146         err = -ENOENT;
3147         goto out_nofid;
3148     }
3149 
3150     oldfidp = get_fid(pdu, oldfid);
3151     if (oldfidp == NULL) {
3152         err = -ENOENT;
3153         goto out;
3154     }
3155     err = v9fs_co_link(pdu, oldfidp, dfidp, &name);
3156     if (!err) {
3157         err = offset;
3158     }
3159     put_fid(pdu, oldfidp);
3160 out:
3161     put_fid(pdu, dfidp);
3162 out_nofid:
3163     v9fs_string_free(&name);
3164     pdu_complete(pdu, err);
3165 }
3166 
3167 /* Only works with path name based fid */
3168 static void coroutine_fn v9fs_remove(void *opaque)
3169 {
3170     int32_t fid;
3171     int err = 0;
3172     size_t offset = 7;
3173     V9fsFidState *fidp;
3174     V9fsPDU *pdu = opaque;
3175 
3176     err = pdu_unmarshal(pdu, offset, "d", &fid);
3177     if (err < 0) {
3178         goto out_nofid;
3179     }
3180     trace_v9fs_remove(pdu->tag, pdu->id, fid);
3181 
3182     fidp = get_fid(pdu, fid);
3183     if (fidp == NULL) {
3184         err = -EINVAL;
3185         goto out_nofid;
3186     }
3187     /* if fs driver is not path based, return EOPNOTSUPP */
3188     if (!(pdu->s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT)) {
3189         err = -EOPNOTSUPP;
3190         goto out_err;
3191     }
3192     /*
3193      * IF the file is unlinked, we cannot reopen
3194      * the file later. So don't reclaim fd
3195      */
3196     err = v9fs_mark_fids_unreclaim(pdu, &fidp->path);
3197     if (err < 0) {
3198         goto out_err;
3199     }
3200     err = v9fs_co_remove(pdu, &fidp->path);
3201     if (!err) {
3202         err = offset;
3203     }
3204 out_err:
3205     /* For TREMOVE we need to clunk the fid even on failed remove */
3206     clunk_fid(pdu->s, fidp->fid);
3207     put_fid(pdu, fidp);
3208 out_nofid:
3209     pdu_complete(pdu, err);
3210 }
3211 
3212 static void coroutine_fn v9fs_unlinkat(void *opaque)
3213 {
3214     int err = 0;
3215     V9fsString name;
3216     int32_t dfid, flags, rflags = 0;
3217     size_t offset = 7;
3218     V9fsPath path;
3219     V9fsFidState *dfidp;
3220     V9fsPDU *pdu = opaque;
3221 
3222     v9fs_string_init(&name);
3223     err = pdu_unmarshal(pdu, offset, "dsd", &dfid, &name, &flags);
3224     if (err < 0) {
3225         goto out_nofid;
3226     }
3227 
3228     if (name_is_illegal(name.data)) {
3229         err = -ENOENT;
3230         goto out_nofid;
3231     }
3232 
3233     if (!strcmp(".", name.data)) {
3234         err = -EINVAL;
3235         goto out_nofid;
3236     }
3237 
3238     if (!strcmp("..", name.data)) {
3239         err = -ENOTEMPTY;
3240         goto out_nofid;
3241     }
3242 
3243     if (flags & ~P9_DOTL_AT_REMOVEDIR) {
3244         err = -EINVAL;
3245         goto out_nofid;
3246     }
3247 
3248     if (flags & P9_DOTL_AT_REMOVEDIR) {
3249         rflags |= AT_REMOVEDIR;
3250     }
3251 
3252     dfidp = get_fid(pdu, dfid);
3253     if (dfidp == NULL) {
3254         err = -EINVAL;
3255         goto out_nofid;
3256     }
3257     /*
3258      * IF the file is unlinked, we cannot reopen
3259      * the file later. So don't reclaim fd
3260      */
3261     v9fs_path_init(&path);
3262     err = v9fs_co_name_to_path(pdu, &dfidp->path, name.data, &path);
3263     if (err < 0) {
3264         goto out_err;
3265     }
3266     err = v9fs_mark_fids_unreclaim(pdu, &path);
3267     if (err < 0) {
3268         goto out_err;
3269     }
3270     err = v9fs_co_unlinkat(pdu, &dfidp->path, &name, rflags);
3271     if (!err) {
3272         err = offset;
3273     }
3274 out_err:
3275     put_fid(pdu, dfidp);
3276     v9fs_path_free(&path);
3277 out_nofid:
3278     pdu_complete(pdu, err);
3279     v9fs_string_free(&name);
3280 }
3281 
3282 
3283 /* Only works with path name based fid */
3284 static int coroutine_fn v9fs_complete_rename(V9fsPDU *pdu, V9fsFidState *fidp,
3285                                              int32_t newdirfid,
3286                                              V9fsString *name)
3287 {
3288     int err = 0;
3289     V9fsPath new_path;
3290     V9fsFidState *tfidp;
3291     V9fsState *s = pdu->s;
3292     V9fsFidState *dirfidp = NULL;
3293     GHashTableIter iter;
3294     gpointer fid;
3295 
3296     v9fs_path_init(&new_path);
3297     if (newdirfid != -1) {
3298         dirfidp = get_fid(pdu, newdirfid);
3299         if (dirfidp == NULL) {
3300             return -ENOENT;
3301         }
3302         if (fidp->fid_type != P9_FID_NONE) {
3303             err = -EINVAL;
3304             goto out;
3305         }
3306         err = v9fs_co_name_to_path(pdu, &dirfidp->path, name->data, &new_path);
3307         if (err < 0) {
3308             goto out;
3309         }
3310     } else {
3311         char *dir_name = g_path_get_dirname(fidp->path.data);
3312         V9fsPath dir_path;
3313 
3314         v9fs_path_init(&dir_path);
3315         v9fs_path_sprintf(&dir_path, "%s", dir_name);
3316         g_free(dir_name);
3317 
3318         err = v9fs_co_name_to_path(pdu, &dir_path, name->data, &new_path);
3319         v9fs_path_free(&dir_path);
3320         if (err < 0) {
3321             goto out;
3322         }
3323     }
3324     err = v9fs_co_rename(pdu, &fidp->path, &new_path);
3325     if (err < 0) {
3326         goto out;
3327     }
3328 
3329     /*
3330      * Fixup fid's pointing to the old name to
3331      * start pointing to the new name
3332      */
3333     g_hash_table_iter_init(&iter, s->fids);
3334     while (g_hash_table_iter_next(&iter, &fid, (gpointer *) &tfidp)) {
3335         if (v9fs_path_is_ancestor(&fidp->path, &tfidp->path)) {
3336             /* replace the name */
3337             v9fs_fix_path(&tfidp->path, &new_path, strlen(fidp->path.data));
3338         }
3339     }
3340 out:
3341     if (dirfidp) {
3342         put_fid(pdu, dirfidp);
3343     }
3344     v9fs_path_free(&new_path);
3345     return err;
3346 }
3347 
3348 /* Only works with path name based fid */
3349 static void coroutine_fn v9fs_rename(void *opaque)
3350 {
3351     int32_t fid;
3352     ssize_t err = 0;
3353     size_t offset = 7;
3354     V9fsString name;
3355     int32_t newdirfid;
3356     V9fsFidState *fidp;
3357     V9fsPDU *pdu = opaque;
3358     V9fsState *s = pdu->s;
3359 
3360     v9fs_string_init(&name);
3361     err = pdu_unmarshal(pdu, offset, "dds", &fid, &newdirfid, &name);
3362     if (err < 0) {
3363         goto out_nofid;
3364     }
3365 
3366     if (name_is_illegal(name.data)) {
3367         err = -ENOENT;
3368         goto out_nofid;
3369     }
3370 
3371     if (!strcmp(".", name.data) || !strcmp("..", name.data)) {
3372         err = -EISDIR;
3373         goto out_nofid;
3374     }
3375 
3376     fidp = get_fid(pdu, fid);
3377     if (fidp == NULL) {
3378         err = -ENOENT;
3379         goto out_nofid;
3380     }
3381     if (fidp->fid_type != P9_FID_NONE) {
3382         err = -EINVAL;
3383         goto out;
3384     }
3385     /* if fs driver is not path based, return EOPNOTSUPP */
3386     if (!(pdu->s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT)) {
3387         err = -EOPNOTSUPP;
3388         goto out;
3389     }
3390     v9fs_path_write_lock(s);
3391     err = v9fs_complete_rename(pdu, fidp, newdirfid, &name);
3392     v9fs_path_unlock(s);
3393     if (!err) {
3394         err = offset;
3395     }
3396 out:
3397     put_fid(pdu, fidp);
3398 out_nofid:
3399     pdu_complete(pdu, err);
3400     v9fs_string_free(&name);
3401 }
3402 
3403 static int coroutine_fn v9fs_fix_fid_paths(V9fsPDU *pdu, V9fsPath *olddir,
3404                                            V9fsString *old_name,
3405                                            V9fsPath *newdir,
3406                                            V9fsString *new_name)
3407 {
3408     V9fsFidState *tfidp;
3409     V9fsPath oldpath, newpath;
3410     V9fsState *s = pdu->s;
3411     int err;
3412     GHashTableIter iter;
3413     gpointer fid;
3414 
3415     v9fs_path_init(&oldpath);
3416     v9fs_path_init(&newpath);
3417     err = v9fs_co_name_to_path(pdu, olddir, old_name->data, &oldpath);
3418     if (err < 0) {
3419         goto out;
3420     }
3421     err = v9fs_co_name_to_path(pdu, newdir, new_name->data, &newpath);
3422     if (err < 0) {
3423         goto out;
3424     }
3425 
3426     /*
3427      * Fixup fid's pointing to the old name to
3428      * start pointing to the new name
3429      */
3430     g_hash_table_iter_init(&iter, s->fids);
3431     while (g_hash_table_iter_next(&iter, &fid, (gpointer *) &tfidp)) {
3432         if (v9fs_path_is_ancestor(&oldpath, &tfidp->path)) {
3433             /* replace the name */
3434             v9fs_fix_path(&tfidp->path, &newpath, strlen(oldpath.data));
3435         }
3436     }
3437 out:
3438     v9fs_path_free(&oldpath);
3439     v9fs_path_free(&newpath);
3440     return err;
3441 }
3442 
3443 static int coroutine_fn v9fs_complete_renameat(V9fsPDU *pdu, int32_t olddirfid,
3444                                                V9fsString *old_name,
3445                                                int32_t newdirfid,
3446                                                V9fsString *new_name)
3447 {
3448     int err = 0;
3449     V9fsState *s = pdu->s;
3450     V9fsFidState *newdirfidp = NULL, *olddirfidp = NULL;
3451 
3452     olddirfidp = get_fid(pdu, olddirfid);
3453     if (olddirfidp == NULL) {
3454         err = -ENOENT;
3455         goto out;
3456     }
3457     if (newdirfid != -1) {
3458         newdirfidp = get_fid(pdu, newdirfid);
3459         if (newdirfidp == NULL) {
3460             err = -ENOENT;
3461             goto out;
3462         }
3463     } else {
3464         newdirfidp = get_fid(pdu, olddirfid);
3465     }
3466 
3467     err = v9fs_co_renameat(pdu, &olddirfidp->path, old_name,
3468                            &newdirfidp->path, new_name);
3469     if (err < 0) {
3470         goto out;
3471     }
3472     if (s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT) {
3473         /* Only for path based fid  we need to do the below fixup */
3474         err = v9fs_fix_fid_paths(pdu, &olddirfidp->path, old_name,
3475                                  &newdirfidp->path, new_name);
3476     }
3477 out:
3478     if (olddirfidp) {
3479         put_fid(pdu, olddirfidp);
3480     }
3481     if (newdirfidp) {
3482         put_fid(pdu, newdirfidp);
3483     }
3484     return err;
3485 }
3486 
3487 static void coroutine_fn v9fs_renameat(void *opaque)
3488 {
3489     ssize_t err = 0;
3490     size_t offset = 7;
3491     V9fsPDU *pdu = opaque;
3492     V9fsState *s = pdu->s;
3493     int32_t olddirfid, newdirfid;
3494     V9fsString old_name, new_name;
3495 
3496     v9fs_string_init(&old_name);
3497     v9fs_string_init(&new_name);
3498     err = pdu_unmarshal(pdu, offset, "dsds", &olddirfid,
3499                         &old_name, &newdirfid, &new_name);
3500     if (err < 0) {
3501         goto out_err;
3502     }
3503 
3504     if (name_is_illegal(old_name.data) || name_is_illegal(new_name.data)) {
3505         err = -ENOENT;
3506         goto out_err;
3507     }
3508 
3509     if (!strcmp(".", old_name.data) || !strcmp("..", old_name.data) ||
3510         !strcmp(".", new_name.data) || !strcmp("..", new_name.data)) {
3511         err = -EISDIR;
3512         goto out_err;
3513     }
3514 
3515     v9fs_path_write_lock(s);
3516     err = v9fs_complete_renameat(pdu, olddirfid,
3517                                  &old_name, newdirfid, &new_name);
3518     v9fs_path_unlock(s);
3519     if (!err) {
3520         err = offset;
3521     }
3522 
3523 out_err:
3524     pdu_complete(pdu, err);
3525     v9fs_string_free(&old_name);
3526     v9fs_string_free(&new_name);
3527 }
3528 
3529 static void coroutine_fn v9fs_wstat(void *opaque)
3530 {
3531     int32_t fid;
3532     int err = 0;
3533     int16_t unused;
3534     V9fsStat v9stat;
3535     size_t offset = 7;
3536     struct stat stbuf;
3537     V9fsFidState *fidp;
3538     V9fsPDU *pdu = opaque;
3539     V9fsState *s = pdu->s;
3540 
3541     v9fs_stat_init(&v9stat);
3542     err = pdu_unmarshal(pdu, offset, "dwS", &fid, &unused, &v9stat);
3543     if (err < 0) {
3544         goto out_nofid;
3545     }
3546     trace_v9fs_wstat(pdu->tag, pdu->id, fid,
3547                      v9stat.mode, v9stat.atime, v9stat.mtime);
3548 
3549     fidp = get_fid(pdu, fid);
3550     if (fidp == NULL) {
3551         err = -EINVAL;
3552         goto out_nofid;
3553     }
3554     /* do we need to sync the file? */
3555     if (donttouch_stat(&v9stat)) {
3556         err = v9fs_co_fsync(pdu, fidp, 0);
3557         goto out;
3558     }
3559     if (v9stat.mode != -1) {
3560         uint32_t v9_mode;
3561         err = v9fs_co_lstat(pdu, &fidp->path, &stbuf);
3562         if (err < 0) {
3563             goto out;
3564         }
3565         v9_mode = stat_to_v9mode(&stbuf);
3566         if ((v9stat.mode & P9_STAT_MODE_TYPE_BITS) !=
3567             (v9_mode & P9_STAT_MODE_TYPE_BITS)) {
3568             /* Attempting to change the type */
3569             err = -EIO;
3570             goto out;
3571         }
3572         err = v9fs_co_chmod(pdu, &fidp->path,
3573                             v9mode_to_mode(v9stat.mode,
3574                                            &v9stat.extension));
3575         if (err < 0) {
3576             goto out;
3577         }
3578     }
3579     if (v9stat.mtime != -1 || v9stat.atime != -1) {
3580         struct timespec times[2];
3581         if (v9stat.atime != -1) {
3582             times[0].tv_sec = v9stat.atime;
3583             times[0].tv_nsec = 0;
3584         } else {
3585             times[0].tv_nsec = UTIME_OMIT;
3586         }
3587         if (v9stat.mtime != -1) {
3588             times[1].tv_sec = v9stat.mtime;
3589             times[1].tv_nsec = 0;
3590         } else {
3591             times[1].tv_nsec = UTIME_OMIT;
3592         }
3593         err = v9fs_co_utimensat(pdu, &fidp->path, times);
3594         if (err < 0) {
3595             goto out;
3596         }
3597     }
3598     if (v9stat.n_gid != -1 || v9stat.n_uid != -1) {
3599         err = v9fs_co_chown(pdu, &fidp->path, v9stat.n_uid, v9stat.n_gid);
3600         if (err < 0) {
3601             goto out;
3602         }
3603     }
3604     if (v9stat.name.size != 0) {
3605         v9fs_path_write_lock(s);
3606         err = v9fs_complete_rename(pdu, fidp, -1, &v9stat.name);
3607         v9fs_path_unlock(s);
3608         if (err < 0) {
3609             goto out;
3610         }
3611     }
3612     if (v9stat.length != -1) {
3613         err = v9fs_co_truncate(pdu, &fidp->path, v9stat.length);
3614         if (err < 0) {
3615             goto out;
3616         }
3617     }
3618     err = offset;
3619 out:
3620     put_fid(pdu, fidp);
3621 out_nofid:
3622     v9fs_stat_free(&v9stat);
3623     pdu_complete(pdu, err);
3624 }
3625 
3626 static int v9fs_fill_statfs(V9fsState *s, V9fsPDU *pdu, struct statfs *stbuf)
3627 {
3628     uint32_t f_type;
3629     uint32_t f_bsize;
3630     uint64_t f_blocks;
3631     uint64_t f_bfree;
3632     uint64_t f_bavail;
3633     uint64_t f_files;
3634     uint64_t f_ffree;
3635     uint64_t fsid_val;
3636     uint32_t f_namelen;
3637     size_t offset = 7;
3638     int32_t bsize_factor;
3639 
3640     /*
3641      * compute bsize factor based on host file system block size
3642      * and client msize
3643      */
3644     bsize_factor = (s->msize - P9_IOHDRSZ) / stbuf->f_bsize;
3645     if (!bsize_factor) {
3646         bsize_factor = 1;
3647     }
3648     f_type  = stbuf->f_type;
3649     f_bsize = stbuf->f_bsize;
3650     f_bsize *= bsize_factor;
3651     /*
3652      * f_bsize is adjusted(multiplied) by bsize factor, so we need to
3653      * adjust(divide) the number of blocks, free blocks and available
3654      * blocks by bsize factor
3655      */
3656     f_blocks = stbuf->f_blocks / bsize_factor;
3657     f_bfree  = stbuf->f_bfree / bsize_factor;
3658     f_bavail = stbuf->f_bavail / bsize_factor;
3659     f_files  = stbuf->f_files;
3660     f_ffree  = stbuf->f_ffree;
3661 #ifdef CONFIG_DARWIN
3662     fsid_val = (unsigned int)stbuf->f_fsid.val[0] |
3663                (unsigned long long)stbuf->f_fsid.val[1] << 32;
3664     f_namelen = NAME_MAX;
3665 #else
3666     fsid_val = (unsigned int) stbuf->f_fsid.__val[0] |
3667                (unsigned long long)stbuf->f_fsid.__val[1] << 32;
3668     f_namelen = stbuf->f_namelen;
3669 #endif
3670 
3671     return pdu_marshal(pdu, offset, "ddqqqqqqd",
3672                        f_type, f_bsize, f_blocks, f_bfree,
3673                        f_bavail, f_files, f_ffree,
3674                        fsid_val, f_namelen);
3675 }
3676 
3677 static void coroutine_fn v9fs_statfs(void *opaque)
3678 {
3679     int32_t fid;
3680     ssize_t retval = 0;
3681     size_t offset = 7;
3682     V9fsFidState *fidp;
3683     struct statfs stbuf;
3684     V9fsPDU *pdu = opaque;
3685     V9fsState *s = pdu->s;
3686 
3687     retval = pdu_unmarshal(pdu, offset, "d", &fid);
3688     if (retval < 0) {
3689         goto out_nofid;
3690     }
3691     fidp = get_fid(pdu, fid);
3692     if (fidp == NULL) {
3693         retval = -ENOENT;
3694         goto out_nofid;
3695     }
3696     retval = v9fs_co_statfs(pdu, &fidp->path, &stbuf);
3697     if (retval < 0) {
3698         goto out;
3699     }
3700     retval = v9fs_fill_statfs(s, pdu, &stbuf);
3701     if (retval < 0) {
3702         goto out;
3703     }
3704     retval += offset;
3705 out:
3706     put_fid(pdu, fidp);
3707 out_nofid:
3708     pdu_complete(pdu, retval);
3709 }
3710 
3711 static void coroutine_fn v9fs_mknod(void *opaque)
3712 {
3713 
3714     int mode;
3715     gid_t gid;
3716     int32_t fid;
3717     V9fsQID qid;
3718     int err = 0;
3719     int major, minor;
3720     size_t offset = 7;
3721     V9fsString name;
3722     struct stat stbuf;
3723     V9fsFidState *fidp;
3724     V9fsPDU *pdu = opaque;
3725 
3726     v9fs_string_init(&name);
3727     err = pdu_unmarshal(pdu, offset, "dsdddd", &fid, &name, &mode,
3728                         &major, &minor, &gid);
3729     if (err < 0) {
3730         goto out_nofid;
3731     }
3732     trace_v9fs_mknod(pdu->tag, pdu->id, fid, mode, major, minor);
3733 
3734     if (name_is_illegal(name.data)) {
3735         err = -ENOENT;
3736         goto out_nofid;
3737     }
3738 
3739     if (!strcmp(".", name.data) || !strcmp("..", name.data)) {
3740         err = -EEXIST;
3741         goto out_nofid;
3742     }
3743 
3744     fidp = get_fid(pdu, fid);
3745     if (fidp == NULL) {
3746         err = -ENOENT;
3747         goto out_nofid;
3748     }
3749     err = v9fs_co_mknod(pdu, fidp, &name, fidp->uid, gid,
3750                         makedev(major, minor), mode, &stbuf);
3751     if (err < 0) {
3752         goto out;
3753     }
3754     err = stat_to_qid(pdu, &stbuf, &qid);
3755     if (err < 0) {
3756         goto out;
3757     }
3758     err = pdu_marshal(pdu, offset, "Q", &qid);
3759     if (err < 0) {
3760         goto out;
3761     }
3762     err += offset;
3763     trace_v9fs_mknod_return(pdu->tag, pdu->id,
3764                             qid.type, qid.version, qid.path);
3765 out:
3766     put_fid(pdu, fidp);
3767 out_nofid:
3768     pdu_complete(pdu, err);
3769     v9fs_string_free(&name);
3770 }
3771 
3772 /*
3773  * Implement posix byte range locking code
3774  * Server side handling of locking code is very simple, because 9p server in
3775  * QEMU can handle only one client. And most of the lock handling
3776  * (like conflict, merging) etc is done by the VFS layer itself, so no need to
3777  * do any thing in * qemu 9p server side lock code path.
3778  * So when a TLOCK request comes, always return success
3779  */
3780 static void coroutine_fn v9fs_lock(void *opaque)
3781 {
3782     V9fsFlock flock;
3783     size_t offset = 7;
3784     struct stat stbuf;
3785     V9fsFidState *fidp;
3786     int32_t fid, err = 0;
3787     V9fsPDU *pdu = opaque;
3788 
3789     v9fs_string_init(&flock.client_id);
3790     err = pdu_unmarshal(pdu, offset, "dbdqqds", &fid, &flock.type,
3791                         &flock.flags, &flock.start, &flock.length,
3792                         &flock.proc_id, &flock.client_id);
3793     if (err < 0) {
3794         goto out_nofid;
3795     }
3796     trace_v9fs_lock(pdu->tag, pdu->id, fid,
3797                     flock.type, flock.start, flock.length);
3798 
3799 
3800     /* We support only block flag now (that too ignored currently) */
3801     if (flock.flags & ~P9_LOCK_FLAGS_BLOCK) {
3802         err = -EINVAL;
3803         goto out_nofid;
3804     }
3805     fidp = get_fid(pdu, fid);
3806     if (fidp == NULL) {
3807         err = -ENOENT;
3808         goto out_nofid;
3809     }
3810     err = v9fs_co_fstat(pdu, fidp, &stbuf);
3811     if (err < 0) {
3812         goto out;
3813     }
3814     err = pdu_marshal(pdu, offset, "b", P9_LOCK_SUCCESS);
3815     if (err < 0) {
3816         goto out;
3817     }
3818     err += offset;
3819     trace_v9fs_lock_return(pdu->tag, pdu->id, P9_LOCK_SUCCESS);
3820 out:
3821     put_fid(pdu, fidp);
3822 out_nofid:
3823     pdu_complete(pdu, err);
3824     v9fs_string_free(&flock.client_id);
3825 }
3826 
3827 /*
3828  * When a TGETLOCK request comes, always return success because all lock
3829  * handling is done by client's VFS layer.
3830  */
3831 static void coroutine_fn v9fs_getlock(void *opaque)
3832 {
3833     size_t offset = 7;
3834     struct stat stbuf;
3835     V9fsFidState *fidp;
3836     V9fsGetlock glock;
3837     int32_t fid, err = 0;
3838     V9fsPDU *pdu = opaque;
3839 
3840     v9fs_string_init(&glock.client_id);
3841     err = pdu_unmarshal(pdu, offset, "dbqqds", &fid, &glock.type,
3842                         &glock.start, &glock.length, &glock.proc_id,
3843                         &glock.client_id);
3844     if (err < 0) {
3845         goto out_nofid;
3846     }
3847     trace_v9fs_getlock(pdu->tag, pdu->id, fid,
3848                        glock.type, glock.start, glock.length);
3849 
3850     fidp = get_fid(pdu, fid);
3851     if (fidp == NULL) {
3852         err = -ENOENT;
3853         goto out_nofid;
3854     }
3855     err = v9fs_co_fstat(pdu, fidp, &stbuf);
3856     if (err < 0) {
3857         goto out;
3858     }
3859     glock.type = P9_LOCK_TYPE_UNLCK;
3860     err = pdu_marshal(pdu, offset, "bqqds", glock.type,
3861                           glock.start, glock.length, glock.proc_id,
3862                           &glock.client_id);
3863     if (err < 0) {
3864         goto out;
3865     }
3866     err += offset;
3867     trace_v9fs_getlock_return(pdu->tag, pdu->id, glock.type, glock.start,
3868                               glock.length, glock.proc_id);
3869 out:
3870     put_fid(pdu, fidp);
3871 out_nofid:
3872     pdu_complete(pdu, err);
3873     v9fs_string_free(&glock.client_id);
3874 }
3875 
3876 static void coroutine_fn v9fs_mkdir(void *opaque)
3877 {
3878     V9fsPDU *pdu = opaque;
3879     size_t offset = 7;
3880     int32_t fid;
3881     struct stat stbuf;
3882     V9fsQID qid;
3883     V9fsString name;
3884     V9fsFidState *fidp;
3885     gid_t gid;
3886     int mode;
3887     int err = 0;
3888 
3889     v9fs_string_init(&name);
3890     err = pdu_unmarshal(pdu, offset, "dsdd", &fid, &name, &mode, &gid);
3891     if (err < 0) {
3892         goto out_nofid;
3893     }
3894     trace_v9fs_mkdir(pdu->tag, pdu->id, fid, name.data, mode, gid);
3895 
3896     if (name_is_illegal(name.data)) {
3897         err = -ENOENT;
3898         goto out_nofid;
3899     }
3900 
3901     if (!strcmp(".", name.data) || !strcmp("..", name.data)) {
3902         err = -EEXIST;
3903         goto out_nofid;
3904     }
3905 
3906     fidp = get_fid(pdu, fid);
3907     if (fidp == NULL) {
3908         err = -ENOENT;
3909         goto out_nofid;
3910     }
3911     err = v9fs_co_mkdir(pdu, fidp, &name, mode, fidp->uid, gid, &stbuf);
3912     if (err < 0) {
3913         goto out;
3914     }
3915     err = stat_to_qid(pdu, &stbuf, &qid);
3916     if (err < 0) {
3917         goto out;
3918     }
3919     err = pdu_marshal(pdu, offset, "Q", &qid);
3920     if (err < 0) {
3921         goto out;
3922     }
3923     err += offset;
3924     trace_v9fs_mkdir_return(pdu->tag, pdu->id,
3925                             qid.type, qid.version, qid.path, err);
3926 out:
3927     put_fid(pdu, fidp);
3928 out_nofid:
3929     pdu_complete(pdu, err);
3930     v9fs_string_free(&name);
3931 }
3932 
3933 static void coroutine_fn v9fs_xattrwalk(void *opaque)
3934 {
3935     int64_t size;
3936     V9fsString name;
3937     ssize_t err = 0;
3938     size_t offset = 7;
3939     int32_t fid, newfid;
3940     V9fsFidState *file_fidp;
3941     V9fsFidState *xattr_fidp = NULL;
3942     V9fsPDU *pdu = opaque;
3943     V9fsState *s = pdu->s;
3944 
3945     v9fs_string_init(&name);
3946     err = pdu_unmarshal(pdu, offset, "dds", &fid, &newfid, &name);
3947     if (err < 0) {
3948         goto out_nofid;
3949     }
3950     trace_v9fs_xattrwalk(pdu->tag, pdu->id, fid, newfid, name.data);
3951 
3952     file_fidp = get_fid(pdu, fid);
3953     if (file_fidp == NULL) {
3954         err = -ENOENT;
3955         goto out_nofid;
3956     }
3957     xattr_fidp = alloc_fid(s, newfid);
3958     if (xattr_fidp == NULL) {
3959         err = -EINVAL;
3960         goto out;
3961     }
3962     v9fs_path_copy(&xattr_fidp->path, &file_fidp->path);
3963     if (!v9fs_string_size(&name)) {
3964         /*
3965          * listxattr request. Get the size first
3966          */
3967         size = v9fs_co_llistxattr(pdu, &xattr_fidp->path, NULL, 0);
3968         if (size < 0) {
3969             err = size;
3970             clunk_fid(s, xattr_fidp->fid);
3971             goto out;
3972         }
3973         /*
3974          * Read the xattr value
3975          */
3976         xattr_fidp->fs.xattr.len = size;
3977         xattr_fidp->fid_type = P9_FID_XATTR;
3978         xattr_fidp->fs.xattr.xattrwalk_fid = true;
3979         xattr_fidp->fs.xattr.value = g_malloc0(size);
3980         if (size) {
3981             err = v9fs_co_llistxattr(pdu, &xattr_fidp->path,
3982                                      xattr_fidp->fs.xattr.value,
3983                                      xattr_fidp->fs.xattr.len);
3984             if (err < 0) {
3985                 clunk_fid(s, xattr_fidp->fid);
3986                 goto out;
3987             }
3988         }
3989         err = pdu_marshal(pdu, offset, "q", size);
3990         if (err < 0) {
3991             goto out;
3992         }
3993         err += offset;
3994     } else {
3995         /*
3996          * specific xattr fid. We check for xattr
3997          * presence also collect the xattr size
3998          */
3999         size = v9fs_co_lgetxattr(pdu, &xattr_fidp->path,
4000                                  &name, NULL, 0);
4001         if (size < 0) {
4002             err = size;
4003             clunk_fid(s, xattr_fidp->fid);
4004             goto out;
4005         }
4006         /*
4007          * Read the xattr value
4008          */
4009         xattr_fidp->fs.xattr.len = size;
4010         xattr_fidp->fid_type = P9_FID_XATTR;
4011         xattr_fidp->fs.xattr.xattrwalk_fid = true;
4012         xattr_fidp->fs.xattr.value = g_malloc0(size);
4013         if (size) {
4014             err = v9fs_co_lgetxattr(pdu, &xattr_fidp->path,
4015                                     &name, xattr_fidp->fs.xattr.value,
4016                                     xattr_fidp->fs.xattr.len);
4017             if (err < 0) {
4018                 clunk_fid(s, xattr_fidp->fid);
4019                 goto out;
4020             }
4021         }
4022         err = pdu_marshal(pdu, offset, "q", size);
4023         if (err < 0) {
4024             goto out;
4025         }
4026         err += offset;
4027     }
4028     trace_v9fs_xattrwalk_return(pdu->tag, pdu->id, size);
4029 out:
4030     put_fid(pdu, file_fidp);
4031     if (xattr_fidp) {
4032         put_fid(pdu, xattr_fidp);
4033     }
4034 out_nofid:
4035     pdu_complete(pdu, err);
4036     v9fs_string_free(&name);
4037 }
4038 
/*
 * P9_XATTR_SIZE_MAX: largest xattr value size that v9fs_xattrcreate()
 * accepts from a client; larger requests are rejected with E2BIG.
 */
#if defined(CONFIG_LINUX)
/* Currently, only Linux has XATTR_SIZE_MAX */
#define P9_XATTR_SIZE_MAX XATTR_SIZE_MAX
#elif defined(CONFIG_DARWIN)
/*
 * Darwin doesn't seem to define a maximum xattr size in its user
 * space header, so it is configured manually here as 64k.
 *
 * Having no limit at all can lead to QEMU crashing during large g_malloc()
 * calls. Because QEMU does not currently support macOS guests, this
 * preliminary value only works because it mirrors the limit that applies
 * to Linux guests.
 */
#define P9_XATTR_SIZE_MAX 65536
#else
#error Missing definition for P9_XATTR_SIZE_MAX for this host system
#endif
4056 
4057 static void coroutine_fn v9fs_xattrcreate(void *opaque)
4058 {
4059     int flags, rflags = 0;
4060     int32_t fid;
4061     uint64_t size;
4062     ssize_t err = 0;
4063     V9fsString name;
4064     size_t offset = 7;
4065     V9fsFidState *file_fidp;
4066     V9fsFidState *xattr_fidp;
4067     V9fsPDU *pdu = opaque;
4068 
4069     v9fs_string_init(&name);
4070     err = pdu_unmarshal(pdu, offset, "dsqd", &fid, &name, &size, &flags);
4071     if (err < 0) {
4072         goto out_nofid;
4073     }
4074     trace_v9fs_xattrcreate(pdu->tag, pdu->id, fid, name.data, size, flags);
4075 
4076     if (flags & ~(P9_XATTR_CREATE | P9_XATTR_REPLACE)) {
4077         err = -EINVAL;
4078         goto out_nofid;
4079     }
4080 
4081     if (flags & P9_XATTR_CREATE) {
4082         rflags |= XATTR_CREATE;
4083     }
4084 
4085     if (flags & P9_XATTR_REPLACE) {
4086         rflags |= XATTR_REPLACE;
4087     }
4088 
4089     if (size > P9_XATTR_SIZE_MAX) {
4090         err = -E2BIG;
4091         goto out_nofid;
4092     }
4093 
4094     file_fidp = get_fid(pdu, fid);
4095     if (file_fidp == NULL) {
4096         err = -EINVAL;
4097         goto out_nofid;
4098     }
4099     if (file_fidp->fid_type != P9_FID_NONE) {
4100         err = -EINVAL;
4101         goto out_put_fid;
4102     }
4103 
4104     /* Make the file fid point to xattr */
4105     xattr_fidp = file_fidp;
4106     xattr_fidp->fid_type = P9_FID_XATTR;
4107     xattr_fidp->fs.xattr.copied_len = 0;
4108     xattr_fidp->fs.xattr.xattrwalk_fid = false;
4109     xattr_fidp->fs.xattr.len = size;
4110     xattr_fidp->fs.xattr.flags = rflags;
4111     v9fs_string_init(&xattr_fidp->fs.xattr.name);
4112     v9fs_string_copy(&xattr_fidp->fs.xattr.name, &name);
4113     xattr_fidp->fs.xattr.value = g_malloc0(size);
4114     err = offset;
4115 out_put_fid:
4116     put_fid(pdu, file_fidp);
4117 out_nofid:
4118     pdu_complete(pdu, err);
4119     v9fs_string_free(&name);
4120 }
4121 
4122 static void coroutine_fn v9fs_readlink(void *opaque)
4123 {
4124     V9fsPDU *pdu = opaque;
4125     size_t offset = 7;
4126     V9fsString target;
4127     int32_t fid;
4128     int err = 0;
4129     V9fsFidState *fidp;
4130 
4131     err = pdu_unmarshal(pdu, offset, "d", &fid);
4132     if (err < 0) {
4133         goto out_nofid;
4134     }
4135     trace_v9fs_readlink(pdu->tag, pdu->id, fid);
4136     fidp = get_fid(pdu, fid);
4137     if (fidp == NULL) {
4138         err = -ENOENT;
4139         goto out_nofid;
4140     }
4141 
4142     v9fs_string_init(&target);
4143     err = v9fs_co_readlink(pdu, &fidp->path, &target);
4144     if (err < 0) {
4145         goto out;
4146     }
4147     err = pdu_marshal(pdu, offset, "s", &target);
4148     if (err < 0) {
4149         v9fs_string_free(&target);
4150         goto out;
4151     }
4152     err += offset;
4153     trace_v9fs_readlink_return(pdu->tag, pdu->id, target.data);
4154     v9fs_string_free(&target);
4155 out:
4156     put_fid(pdu, fidp);
4157 out_nofid:
4158     pdu_complete(pdu, err);
4159 }
4160 
4161 static CoroutineEntry *pdu_co_handlers[] = {
4162     [P9_TREADDIR] = v9fs_readdir,
4163     [P9_TSTATFS] = v9fs_statfs,
4164     [P9_TGETATTR] = v9fs_getattr,
4165     [P9_TSETATTR] = v9fs_setattr,
4166     [P9_TXATTRWALK] = v9fs_xattrwalk,
4167     [P9_TXATTRCREATE] = v9fs_xattrcreate,
4168     [P9_TMKNOD] = v9fs_mknod,
4169     [P9_TRENAME] = v9fs_rename,
4170     [P9_TLOCK] = v9fs_lock,
4171     [P9_TGETLOCK] = v9fs_getlock,
4172     [P9_TRENAMEAT] = v9fs_renameat,
4173     [P9_TREADLINK] = v9fs_readlink,
4174     [P9_TUNLINKAT] = v9fs_unlinkat,
4175     [P9_TMKDIR] = v9fs_mkdir,
4176     [P9_TVERSION] = v9fs_version,
4177     [P9_TLOPEN] = v9fs_open,
4178     [P9_TATTACH] = v9fs_attach,
4179     [P9_TSTAT] = v9fs_stat,
4180     [P9_TWALK] = v9fs_walk,
4181     [P9_TCLUNK] = v9fs_clunk,
4182     [P9_TFSYNC] = v9fs_fsync,
4183     [P9_TOPEN] = v9fs_open,
4184     [P9_TREAD] = v9fs_read,
4185 #if 0
4186     [P9_TAUTH] = v9fs_auth,
4187 #endif
4188     [P9_TFLUSH] = v9fs_flush,
4189     [P9_TLINK] = v9fs_link,
4190     [P9_TSYMLINK] = v9fs_symlink,
4191     [P9_TCREATE] = v9fs_create,
4192     [P9_TLCREATE] = v9fs_lcreate,
4193     [P9_TWRITE] = v9fs_write,
4194     [P9_TWSTAT] = v9fs_wstat,
4195     [P9_TREMOVE] = v9fs_remove,
4196 };
4197 
4198 static void coroutine_fn v9fs_op_not_supp(void *opaque)
4199 {
4200     V9fsPDU *pdu = opaque;
4201     pdu_complete(pdu, -EOPNOTSUPP);
4202 }
4203 
4204 static void coroutine_fn v9fs_fs_ro(void *opaque)
4205 {
4206     V9fsPDU *pdu = opaque;
4207     pdu_complete(pdu, -EROFS);
4208 }
4209 
4210 static inline bool is_read_only_op(V9fsPDU *pdu)
4211 {
4212     switch (pdu->id) {
4213     case P9_TREADDIR:
4214     case P9_TSTATFS:
4215     case P9_TGETATTR:
4216     case P9_TXATTRWALK:
4217     case P9_TLOCK:
4218     case P9_TGETLOCK:
4219     case P9_TREADLINK:
4220     case P9_TVERSION:
4221     case P9_TLOPEN:
4222     case P9_TATTACH:
4223     case P9_TSTAT:
4224     case P9_TWALK:
4225     case P9_TCLUNK:
4226     case P9_TFSYNC:
4227     case P9_TOPEN:
4228     case P9_TREAD:
4229     case P9_TAUTH:
4230     case P9_TFLUSH:
4231         return 1;
4232     default:
4233         return 0;
4234     }
4235 }
4236 
4237 void pdu_submit(V9fsPDU *pdu, P9MsgHeader *hdr)
4238 {
4239     Coroutine *co;
4240     CoroutineEntry *handler;
4241     V9fsState *s = pdu->s;
4242 
4243     pdu->size = le32_to_cpu(hdr->size_le);
4244     pdu->id = hdr->id;
4245     pdu->tag = le16_to_cpu(hdr->tag_le);
4246 
4247     if (pdu->id >= ARRAY_SIZE(pdu_co_handlers) ||
4248         (pdu_co_handlers[pdu->id] == NULL)) {
4249         handler = v9fs_op_not_supp;
4250     } else if (is_ro_export(&s->ctx) && !is_read_only_op(pdu)) {
4251         handler = v9fs_fs_ro;
4252     } else {
4253         handler = pdu_co_handlers[pdu->id];
4254     }
4255 
4256     qemu_co_queue_init(&pdu->complete);
4257     co = qemu_coroutine_create(handler, pdu);
4258     qemu_coroutine_enter(co);
4259 }
4260 
4261 /* Returns 0 on success, 1 on failure. */
4262 int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
4263                                Error **errp)
4264 {
4265     ERRP_GUARD();
4266     int i, len;
4267     struct stat stat;
4268     FsDriverEntry *fse;
4269     V9fsPath path;
4270     int rc = 1;
4271 
4272     assert(!s->transport);
4273     s->transport = t;
4274 
4275     /* initialize pdu allocator */
4276     QLIST_INIT(&s->free_list);
4277     QLIST_INIT(&s->active_list);
4278     for (i = 0; i < MAX_REQ; i++) {
4279         QLIST_INSERT_HEAD(&s->free_list, &s->pdus[i], next);
4280         s->pdus[i].s = s;
4281         s->pdus[i].idx = i;
4282     }
4283 
4284     v9fs_path_init(&path);
4285 
4286     fse = get_fsdev_fsentry(s->fsconf.fsdev_id);
4287 
4288     if (!fse) {
4289         /* We don't have a fsdev identified by fsdev_id */
4290         error_setg(errp, "9pfs device couldn't find fsdev with the "
4291                    "id = %s",
4292                    s->fsconf.fsdev_id ? s->fsconf.fsdev_id : "NULL");
4293         goto out;
4294     }
4295 
4296     if (!s->fsconf.tag) {
4297         /* we haven't specified a mount_tag */
4298         error_setg(errp, "fsdev with id %s needs mount_tag arguments",
4299                    s->fsconf.fsdev_id);
4300         goto out;
4301     }
4302 
4303     s->ctx.export_flags = fse->export_flags;
4304     s->ctx.fs_root = g_strdup(fse->path);
4305     s->ctx.exops.get_st_gen = NULL;
4306     len = strlen(s->fsconf.tag);
4307     if (len > MAX_TAG_LEN - 1) {
4308         error_setg(errp, "mount tag '%s' (%d bytes) is longer than "
4309                    "maximum (%d bytes)", s->fsconf.tag, len, MAX_TAG_LEN - 1);
4310         goto out;
4311     }
4312 
4313     s->tag = g_strdup(s->fsconf.tag);
4314     s->ctx.uid = -1;
4315 
4316     s->ops = fse->ops;
4317 
4318     s->ctx.fmode = fse->fmode;
4319     s->ctx.dmode = fse->dmode;
4320 
4321     s->fids = g_hash_table_new(NULL, NULL);
4322     qemu_co_rwlock_init(&s->rename_lock);
4323 
4324     if (s->ops->init(&s->ctx, errp) < 0) {
4325         error_prepend(errp, "cannot initialize fsdev '%s': ",
4326                       s->fsconf.fsdev_id);
4327         goto out;
4328     }
4329 
4330     /*
4331      * Check details of export path, We need to use fs driver
4332      * call back to do that. Since we are in the init path, we don't
4333      * use co-routines here.
4334      */
4335     if (s->ops->name_to_path(&s->ctx, NULL, "/", &path) < 0) {
4336         error_setg(errp,
4337                    "error in converting name to path %s", strerror(errno));
4338         goto out;
4339     }
4340     if (s->ops->lstat(&s->ctx, &path, &stat)) {
4341         error_setg(errp, "share path %s does not exist", fse->path);
4342         goto out;
4343     } else if (!S_ISDIR(stat.st_mode)) {
4344         error_setg(errp, "share path %s is not a directory", fse->path);
4345         goto out;
4346     }
4347 
4348     s->dev_id = stat.st_dev;
4349 
4350     /* init inode remapping : */
4351     /* hash table for variable length inode suffixes */
4352     qpd_table_init(&s->qpd_table);
4353     /* hash table for slow/full inode remapping (most users won't need it) */
4354     qpf_table_init(&s->qpf_table);
4355     /* hash table for quick inode remapping */
4356     qpp_table_init(&s->qpp_table);
4357     s->qp_ndevices = 0;
4358     s->qp_affix_next = 1; /* reserve 0 to detect overflow */
4359     s->qp_fullpath_next = 1;
4360 
4361     s->ctx.fst = &fse->fst;
4362     fsdev_throttle_init(s->ctx.fst);
4363 
4364     s->reclaiming = false;
4365 
4366     rc = 0;
4367 out:
4368     if (rc) {
4369         v9fs_device_unrealize_common(s);
4370     }
4371     v9fs_path_free(&path);
4372     return rc;
4373 }
4374 
4375 void v9fs_device_unrealize_common(V9fsState *s)
4376 {
4377     if (s->ops && s->ops->cleanup) {
4378         s->ops->cleanup(&s->ctx);
4379     }
4380     if (s->ctx.fst) {
4381         fsdev_throttle_cleanup(s->ctx.fst);
4382     }
4383     if (s->fids) {
4384         g_hash_table_destroy(s->fids);
4385         s->fids = NULL;
4386     }
4387     g_free(s->tag);
4388     qp_table_destroy(&s->qpd_table);
4389     qp_table_destroy(&s->qpp_table);
4390     qp_table_destroy(&s->qpf_table);
4391     g_free(s->ctx.fs_root);
4392 }
4393 
/*
 * Argument passed from v9fs_reset() to the virtfs_co_reset() coroutine.
 */
typedef struct VirtfsCoResetData {
    /* PDU handed to virtfs_reset(); v9fs_reset() only sets its .s member */
    V9fsPDU pdu;
    /* set to true by virtfs_co_reset() after virtfs_reset() has returned */
    bool done;
} VirtfsCoResetData;
4398 
4399 static void coroutine_fn virtfs_co_reset(void *opaque)
4400 {
4401     VirtfsCoResetData *data = opaque;
4402 
4403     virtfs_reset(&data->pdu);
4404     data->done = true;
4405 }
4406 
4407 void v9fs_reset(V9fsState *s)
4408 {
4409     VirtfsCoResetData data = { .pdu = { .s = s }, .done = false };
4410     Coroutine *co;
4411 
4412     while (!QLIST_EMPTY(&s->active_list)) {
4413         aio_poll(qemu_get_aio_context(), true);
4414     }
4415 
4416     co = qemu_coroutine_create(virtfs_co_reset, &data);
4417     qemu_coroutine_enter(co);
4418 
4419     while (!data.done) {
4420         aio_poll(qemu_get_aio_context(), true);
4421     }
4422 }
4423 
4424 static void __attribute__((__constructor__)) v9fs_set_fd_limit(void)
4425 {
4426     struct rlimit rlim;
4427     if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
4428         error_report("Failed to get the resource limit");
4429         exit(1);
4430     }
4431     open_fd_hw = rlim.rlim_cur - MIN(400, rlim.rlim_cur / 3);
4432     open_fd_rc = rlim.rlim_cur / 2;
4433 }
4434