xref: /openbmc/qemu/block/file-posix.c (revision 864814f71b4cbb2e65bc83a502e63b3cbdd43b0f)
1 /*
2  * Block driver for RAW files (posix)
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu/cutils.h"
28 #include "qemu/error-report.h"
29 #include "block/block-io.h"
30 #include "block/block_int.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "qemu/units.h"
34 #include "qemu/memalign.h"
35 #include "trace.h"
36 #include "block/thread-pool.h"
37 #include "qemu/iov.h"
38 #include "block/raw-aio.h"
39 #include "qobject/qdict.h"
40 #include "qobject/qstring.h"
41 
42 #include "scsi/pr-manager.h"
43 #include "scsi/constants.h"
44 #include "scsi/utils.h"
45 
46 #if defined(__APPLE__) && (__MACH__)
47 #include <sys/ioctl.h>
48 #if defined(HAVE_HOST_BLOCK_DEVICE)
49 #include <paths.h>
50 #include <sys/param.h>
51 #include <sys/mount.h>
52 #include <IOKit/IOKitLib.h>
53 #include <IOKit/IOBSD.h>
54 #include <IOKit/storage/IOMediaBSDClient.h>
55 #include <IOKit/storage/IOMedia.h>
56 #include <IOKit/storage/IOCDMedia.h>
57 //#include <IOKit/storage/IOCDTypes.h>
58 #include <IOKit/storage/IODVDMedia.h>
59 #include <CoreFoundation/CoreFoundation.h>
60 #endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
61 #endif
62 
63 #ifdef __sun__
64 #define _POSIX_PTHREAD_SEMANTICS 1
65 #include <sys/dkio.h>
66 #endif
67 #ifdef __linux__
68 #include <sys/ioctl.h>
69 #include <sys/param.h>
70 #include <sys/syscall.h>
71 #include <sys/vfs.h>
72 #if defined(CONFIG_BLKZONED)
73 #include <linux/blkzoned.h>
74 #endif
75 #include <linux/cdrom.h>
76 #include <linux/dm-ioctl.h>
77 #include <linux/fd.h>
78 #include <linux/fs.h>
79 #include <linux/hdreg.h>
80 #include <linux/magic.h>
81 #include <scsi/sg.h>
82 #ifdef __s390__
83 #include <asm/dasd.h>
84 #endif
85 #ifndef FS_NOCOW_FL
86 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
87 #endif
88 #endif
89 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
90 #include <linux/falloc.h>
91 #endif
92 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
93 #include <sys/disk.h>
94 #include <sys/cdio.h>
95 #endif
96 
97 #ifdef __OpenBSD__
98 #include <sys/ioctl.h>
99 #include <sys/disklabel.h>
100 #include <sys/dkio.h>
101 #endif
102 
103 #ifdef __NetBSD__
104 #include <sys/ioctl.h>
105 #include <sys/disklabel.h>
106 #include <sys/dkio.h>
107 #include <sys/disk.h>
108 #endif
109 
110 #ifdef __DragonFly__
111 #include <sys/ioctl.h>
112 #include <sys/diskslice.h>
113 #endif
114 
115 #ifdef EMSCRIPTEN
116 #include <sys/ioctl.h>
117 #endif
118 
119 /* OS X does not have O_DSYNC */
120 #ifndef O_DSYNC
121 #ifdef O_SYNC
122 #define O_DSYNC O_SYNC
123 #elif defined(O_FSYNC)
124 #define O_DSYNC O_FSYNC
125 #endif
126 #endif
127 
128 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
129 #ifndef O_DIRECT
130 #define O_DIRECT O_DSYNC
131 #endif
132 
133 #define FTYPE_FILE   0
134 #define FTYPE_CD     1
135 
136 #define MAX_BLOCKSIZE 4096
137 
138 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
139  * leaving a few more bytes for its future use. */
140 #define RAW_LOCK_PERM_BASE             100
141 #define RAW_LOCK_SHARED_BASE           200
142 
143 /*
144  * Multiple retries are mostly meant for two separate scenarios:
145  *
146  * - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another
147  *   path goes down.
148  *
149  * - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have
150  *   to send another SG_IO to switch to another path group to probe the paths in
151  *   it.
152  *
153  * Even if each path is in a separate path group (path_grouping_policy set to
154  * failover), it's rare to have more than eight path groups - and even then
155  * pretty unlikely that only bad path groups would be chosen in eight retries.
156  */
157 #define SG_IO_MAX_RETRIES 8
158 
/*
 * Driver-private state of the POSIX raw file driver; reached through
 * bs->opaque by the functions in this file.
 */
typedef struct BDRVRawState {
    /* Host descriptor; fd_open() treats a negative value as "not usable". */
    int fd;
    /* Whether image locking is used; derived from the "locking" option. */
    bool use_lock;
    /* One of the FTYPE_* constants. */
    int type;
    /* Flags handed to qemu_open(), as adjusted by raw_parse_flags(). */
    int open_flags;
    /* Memory buffer alignment chosen by raw_probe_alignment(). */
    size_t buf_align;

    /* The current permissions. */
    uint64_t perm;
    uint64_t shared_perm;

    /* The perms bits whose corresponding bytes are already locked in
     * s->fd. */
    uint64_t locked_perm;
    uint64_t locked_shared_perm;

    /* Value of the "aio-max-batch" option (0 when not given). */
    uint64_t aio_max_batch;

    /* NOTE(review): the three fields below are not used in the visible part
     * of the file; presumably they carry state for permission changes and
     * reopen -- confirm against their users. */
    int perm_change_fd;
    int perm_change_flags;
    BDRVReopenState *reopen_state;

    /* Both set to true by raw_open_common(). */
    bool has_discard:1;
    bool has_write_zeroes:1;
    /* True when the selected "aio" mode is native. */
    bool use_linux_aio:1;
    /* Result of laio_has_fdsync() for s->fd (only queried for aio=native). */
    bool has_laio_fdsync:1;
    /* True when the selected "aio" mode is io_uring. */
    bool use_linux_io_uring:1;
    /* NOTE(review): not set in the visible part of the file. */
    bool use_mpath:1;
    int page_cache_inconsistent; /* errno from fdatasync failure */
    /* Set by raw_open_common() for regular files opened without @device. */
    bool has_fallocate;
    /* Cached result of raw_needs_alignment(). */
    bool needs_alignment;
    /* Force aligned requests even without O_DIRECT (set for FreeBSD
     * character devices in raw_open_common()). */
    bool force_alignment;
    /* Values of the "drop-cache" and "x-check-cache-dropped" options. */
    bool drop_cache;
    bool check_cache_dropped;
    /* NOTE(review): presumably discard statistics; updated outside the
     * visible part of the file. */
    struct {
        uint64_t discard_nb_ok;
        uint64_t discard_nb_failed;
        uint64_t discard_bytes_ok;
    } stats;

    /* Persistent reservation manager looked up from the "pr-manager"
     * option; left untouched when the option is absent. */
    PRManager *pr_mgr;
} BDRVRawState;
201 
/*
 * Reopen-time counterpart of the like-named BDRVRawState fields.
 * NOTE(review): the code that fills and consumes this struct is outside the
 * visible part of the file -- confirm the exact lifecycle there.
 */
typedef struct BDRVRawReopenState {
    int open_flags;
    bool drop_cache;
    bool check_cache_dropped;
} BDRVRawReopenState;
207 
208 static int fd_open(BlockDriverState *bs)
209 {
210     BDRVRawState *s = bs->opaque;
211 
212     /* this is just to ensure s->fd is sane (its called by io ops) */
213     if (s->fd >= 0) {
214         return 0;
215     }
216     return -EIO;
217 }
218 
/* Forward declarations; both functions are defined later in this file. */
static int64_t raw_getlength(BlockDriverState *bs);
static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs);
221 
/*
 * Description of a single request: the common fields come first, followed by
 * an anonymous union holding the arguments specific to one kind of request.
 * NOTE(review): the code that fills and consumes this struct is outside the
 * visible part of the file; presumably aio_type selects which union member
 * is valid -- confirm against the users.
 */
typedef struct RawPosixAIOData {
    BlockDriverState *bs;
    int aio_type;
    int aio_fildes;

    off_t aio_offset;
    uint64_t aio_nbytes;

    union {
        /* Scatter/gather I/O vector */
        struct {
            struct iovec *iov;
            int niov;
        } io;
        /* ioctl command and its argument buffer */
        struct {
            uint64_t cmd;
            void *buf;
        } ioctl;
        /* Second descriptor/offset pair of a copy_range request */
        struct {
            int aio_fd2;
            off_t aio_offset2;
        } copy_range;
        /* Truncate arguments */
        struct {
            PreallocMode prealloc;
            Error **errp;
        } truncate;
        /* Zone report arguments */
        struct {
            unsigned int *nr_zones;
            BlockZoneDescriptor *zones;
        } zone_report;
        /* Zone management operation */
        struct {
            unsigned long op;
        } zone_mgmt;
    };
} RawPosixAIOData;
256 
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
/* Forward declaration, only needed on FreeBSD-based hosts. */
static int cdrom_reopen(BlockDriverState *bs);
#endif
260 
/*
 * Set a locking error in @errp.
 *
 * Elide EAGAIN and EACCES details when failing to lock, as this
 * indicates that the specified file region is already locked by
 * another process, which is considered a common scenario.  For these two
 * values the message is set without the errno text; any other @err is
 * reported through error_setg_errno().
 *
 * @err is a positive errno value.  Note that the macro evaluates @err more
 * than once, so pass an expression without side effects.
 */
#define raw_lock_error_setg_errno(errp, err, fmt, ...)                  \
    do {                                                                \
        if ((err) == EAGAIN || (err) == EACCES) {                       \
            error_setg((errp), (fmt), ## __VA_ARGS__);                  \
        } else {                                                        \
            error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__);     \
        }                                                               \
    } while (0)
274 
275 #if defined(__NetBSD__)
276 static int raw_normalize_devicepath(const char **filename, Error **errp)
277 {
278     static char namebuf[PATH_MAX];
279     const char *dp, *fname;
280     struct stat sb;
281 
282     fname = *filename;
283     dp = strrchr(fname, '/');
284     if (lstat(fname, &sb) < 0) {
285         error_setg_file_open(errp, errno, fname);
286         return -errno;
287     }
288 
289     if (!S_ISBLK(sb.st_mode)) {
290         return 0;
291     }
292 
293     if (dp == NULL) {
294         snprintf(namebuf, PATH_MAX, "r%s", fname);
295     } else {
296         snprintf(namebuf, PATH_MAX, "%.*s/r%s",
297             (int)(dp - fname), fname, dp + 1);
298     }
299     *filename = namebuf;
300     warn_report("%s is a block device, using %s", fname, *filename);
301 
302     return 0;
303 }
304 #else
305 static int raw_normalize_devicepath(const char **filename, Error **errp)
306 {
307     return 0;
308 }
309 #endif
310 
/*
 * Query the logical block size of @fd with the sector-size ioctls known for
 * this host.
 *
 * Every available ioctl is tried; each one that succeeds stores its result
 * in @sector_size_p.  Returns 0 if at least one ioctl succeeded, otherwise
 * -errno (-ENOTSUP when no ioctl is available at all).
 */
static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
{
    static const unsigned long candidates[] = {
#ifdef BLKSSZGET
        BLKSSZGET,
#endif
#ifdef DKIOCGETBLOCKSIZE
        DKIOCGETBLOCKSIZE,
#endif
#ifdef DIOCGSECTORSIZE
        DIOCGSECTORSIZE,
#endif
    };
    unsigned int size;
    bool found = false;
    int idx = 0;

    /* Reported if the candidate list is empty. */
    errno = ENOTSUP;

    while (idx < (int)ARRAY_SIZE(candidates)) {
        if (ioctl(fd, candidates[idx], &size) >= 0) {
            *sector_size_p = size;
            found = true;
        }
        idx++;
    }

    if (!found) {
        return -errno;
    }
    return 0;
}
343 
/**
 * Query the physical block size of @fd.
 *
 * Returns 0 and stores the size in @blk_size on success, -errno if the
 * ioctl fails, or -ENOTSUP when BLKPBSZGET is not available on this host.
 */
static int probe_physical_blocksize(int fd, unsigned int *blk_size)
{
#ifdef BLKPBSZGET
    return ioctl(fd, BLKPBSZGET, blk_size) < 0 ? -errno : 0;
#else
    return -ENOTSUP;
#endif
}
360 
/*
 * Tell whether @fd needs no alignment restrictions even when it is opened
 * with O_DIRECT.
 *
 * raw_probe_alignment() treats a probed alignment of 1 as a failed probe and
 * falls back to a safe default of 4k.  That fallback can be avoided when
 * byte alignment is known to be fine for the file, which is what this
 * function reports.
 *
 * Returns true only on Linux, when fstatfs() succeeds and reports
 * NFS_SUPER_MAGIC as the filesystem type.
 */
static bool dio_byte_aligned(int fd)
{
#ifdef __linux__
    struct statfs fs_info;

    if (fstatfs(fd, &fs_info) != 0) {
        return false;
    }
    return fs_info.f_type == NFS_SUPER_MAGIC;
#else
    return false;
#endif
}
382 
383 static bool raw_needs_alignment(BlockDriverState *bs)
384 {
385     BDRVRawState *s = bs->opaque;
386 
387     if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
388         return true;
389     }
390 
391     return s->force_alignment;
392 }
393 
/*
 * Probe whether a read of @len bytes into @buf from offset 0 of @fd is
 * accepted.
 *
 * Used to check the memory buffer and request alignment for O_DIRECT.
 * Returns true if the read is considered sufficiently aligned.
 */
static bool raw_is_io_aligned(int fd, void *buf, size_t len)
{
    if (pread(fd, buf, len, 0) >= 0) {
        return true;
    }

#ifdef __linux__
    /*
     * The Linux kernel signals misaligned O_DIRECT reads with EINVAL.  Any
     * other error (e.g. a real I/O error on a failing drive) is ignored,
     * because only the alignment is being probed here.
     */
    return errno != EINVAL;
#else
    return false;
#endif
}
418 
/*
 * Determine the request alignment (bs->bl.request_alignment) and the memory
 * buffer alignment (s->buf_align) to use for @fd.
 *
 * If @bs is a SCSI generic device or no alignment is needed, both are set
 * to 1.  Otherwise the values are taken from ioctls where possible and
 * guessed by issuing test reads for the rest.  If either value is still
 * unknown at the end, an error is set in @errp.
 */
static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    char *buf;
    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
    /* Candidate alignments, tried in increasing order by the probes below */
    size_t alignments[] = {1, 512, 1024, 2048, 4096};

    /* For SCSI generic devices the alignment is not really used.
       With buffered I/O, we don't have any restrictions. */
    if (bdrv_is_sg(bs) || !s->needs_alignment) {
        bs->bl.request_alignment = 1;
        s->buf_align = 1;
        return;
    }

    /* 0 means "not determined yet" for both values from here on */
    bs->bl.request_alignment = 0;
    s->buf_align = 0;
    /* Let's try to use the logical blocksize for the alignment. */
    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
        bs->bl.request_alignment = 0;
    }

#ifdef __linux__
    /*
     * The XFS ioctl definitions are shipped in extra packages that might
     * not always be available. Since we just need the XFS_IOC_DIOINFO ioctl
     * here, we simply use our own definition instead:
     */
    struct xfs_dioattr {
        uint32_t d_mem;
        uint32_t d_miniosz;
        uint32_t d_maxiosz;
    } da;
    if (ioctl(fd, _IOR('X', 30, struct xfs_dioattr), &da) >= 0) {
        bs->bl.request_alignment = da.d_miniosz;
        /* The kernel returns wrong information for d_mem */
        /* s->buf_align = da.d_mem; */
    }
#endif

    /*
     * If we could not get the sizes so far, we can only guess them. First try
     * to detect request alignment, since it is more likely to succeed. Then
     * try to detect buf_align, which cannot be detected in some cases (e.g.
     * Gluster). If buf_align cannot be detected, we fallback to the value of
     * request_alignment.
     */

    if (!bs->bl.request_alignment) {
        int i;
        size_t align;
        /* Fully aligned buffer: only the request length varies here */
        buf = qemu_memalign(max_align, max_align);
        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
            align = alignments[i];
            if (raw_is_io_aligned(fd, buf, align)) {
                /* Fallback to safe value. */
                bs->bl.request_alignment = (align != 1) ? align : max_align;
                break;
            }
        }
        qemu_vfree(buf);
    }

    if (!s->buf_align) {
        int i;
        size_t align;
        /* Twice max_align so that buf + align still has max_align bytes */
        buf = qemu_memalign(max_align, 2 * max_align);
        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
            align = alignments[i];
            /* Fixed request length: only the buffer offset varies here */
            if (raw_is_io_aligned(fd, buf + align, max_align)) {
                /* Fallback to request_alignment. */
                s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
                break;
            }
        }
        qemu_vfree(buf);
    }

    if (!s->buf_align || !bs->bl.request_alignment) {
        error_setg(errp, "Could not find working O_DIRECT alignment");
        error_append_hint(errp, "Try cache.direct=off\n");
    }
}
502 
/*
 * Check whether the block device behind @fd may be written to.
 *
 * Returns 0 if @fd is writable as far as this check can tell (including
 * when @fd is not a block device or BLKROGET is unavailable), -EACCES if
 * the block device is flagged read-only, or -errno if fstat() or the ioctl
 * fails.
 */
static int check_hdev_writable(int fd)
{
#if defined(BLKROGET)
    /*
     * Linux block devices can be configured "read-only" using blockdev(8).
     * This is independent of device node permissions and therefore open(2)
     * with O_RDWR succeeds.  Actual writes fail with EPERM.
     *
     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
     * check for read-only block devices so that Linux block devices behave
     * properly.
     */
    struct stat info;
    int ro_flag = 0;

    if (fstat(fd, &info) != 0) {
        return -errno;
    }

    if (S_ISBLK(info.st_mode)) {
        if (ioctl(fd, BLKROGET, &ro_flag) < 0) {
            return -errno;
        }
        if (ro_flag) {
            return -EACCES;
        }
    }
#endif /* defined(BLKROGET) */
    return 0;
}
535 
536 static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
537 {
538     bool read_write = false;
539     assert(open_flags != NULL);
540 
541     *open_flags |= O_BINARY;
542     *open_flags &= ~O_ACCMODE;
543 
544     if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
545         read_write = has_writers;
546     } else if (bdrv_flags & BDRV_O_RDWR) {
547         read_write = true;
548     }
549 
550     if (read_write) {
551         *open_flags |= O_RDWR;
552     } else {
553         *open_flags |= O_RDONLY;
554     }
555 
556     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
557      * and O_DIRECT for no caching. */
558     if ((bdrv_flags & BDRV_O_NOCACHE)) {
559         *open_flags |= O_DIRECT;
560     }
561 }
562 
/*
 * Filename parsing hook: hands @filename and the "file:" protocol prefix to
 * bdrv_parse_filename_strip_prefix(), which fills @options.  @errp is not
 * used.
 */
static void raw_parse_filename(const char *filename, QDict *options,
                               Error **errp)
{
    bdrv_parse_filename_strip_prefix(filename, "file:", options);
}
568 
/*
 * Runtime options understood by raw_open_common(), which absorbs them from
 * the options QDict passed to it.
 */
static QemuOptsList raw_runtime_opts = {
    .name = "raw",
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
        {
            .name = "aio",
            .type = QEMU_OPT_STRING,
            .help = "host AIO implementation (threads, native, io_uring)",
        },
        {
            .name = "aio-max-batch",
            .type = QEMU_OPT_NUMBER,
            .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
        },
        {
            .name = "locking",
            .type = QEMU_OPT_STRING,
            .help = "file locking mode (on/off/auto, default: auto)",
        },
        {
            .name = "pr-manager",
            .type = QEMU_OPT_STRING,
            .help = "id of persistent reservation manager object (default: none)",
        },
#if defined(__linux__)
        /* Only accepted on Linux hosts */
        {
            .name = "drop-cache",
            .type = QEMU_OPT_BOOL,
            .help = "invalidate page cache during live migration (default: on)",
        },
#endif
        {
            .name = "x-check-cache-dropped",
            .type = QEMU_OPT_BOOL,
            .help = "check that page cache was dropped on live migration (default: off)"
        },
        { /* end of list */ }
    },
};
613 
/*
 * NULL-terminated list of option names.
 * NOTE(review): presumably the options that may be changed on reopen; the
 * users of this array are outside the visible part of the file -- confirm.
 */
static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
615 
616 static int raw_open_common(BlockDriverState *bs, QDict *options,
617                            int bdrv_flags, int open_flags,
618                            bool device, Error **errp)
619 {
620     BDRVRawState *s = bs->opaque;
621     QemuOpts *opts;
622     Error *local_err = NULL;
623     const char *filename = NULL;
624     const char *str;
625     BlockdevAioOptions aio, aio_default;
626     int fd, ret;
627     struct stat st;
628     OnOffAuto locking;
629 
630     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
631     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
632         ret = -EINVAL;
633         goto fail;
634     }
635 
636     filename = qemu_opt_get(opts, "filename");
637 
638     ret = raw_normalize_devicepath(&filename, errp);
639     if (ret != 0) {
640         goto fail;
641     }
642 
643     if (bdrv_flags & BDRV_O_NATIVE_AIO) {
644         aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
645 #ifdef CONFIG_LINUX_IO_URING
646     } else if (bdrv_flags & BDRV_O_IO_URING) {
647         aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
648 #endif
649     } else {
650         aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
651     }
652 
653     aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
654                           qemu_opt_get(opts, "aio"),
655                           aio_default, &local_err);
656     if (local_err) {
657         error_propagate(errp, local_err);
658         ret = -EINVAL;
659         goto fail;
660     }
661 
662     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
663 #ifdef CONFIG_LINUX_IO_URING
664     s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
665 #endif
666 
667     s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);
668 
669     locking = qapi_enum_parse(&OnOffAuto_lookup,
670                               qemu_opt_get(opts, "locking"),
671                               ON_OFF_AUTO_AUTO, &local_err);
672     if (local_err) {
673         error_propagate(errp, local_err);
674         ret = -EINVAL;
675         goto fail;
676     }
677     switch (locking) {
678     case ON_OFF_AUTO_ON:
679         s->use_lock = true;
680         if (!qemu_has_ofd_lock()) {
681             warn_report("File lock requested but OFD locking syscall is "
682                         "unavailable, falling back to POSIX file locks");
683             error_printf("Due to the implementation, locks can be lost "
684                          "unexpectedly.\n");
685         }
686         break;
687     case ON_OFF_AUTO_OFF:
688         s->use_lock = false;
689         break;
690     case ON_OFF_AUTO_AUTO:
691         s->use_lock = qemu_has_ofd_lock();
692         break;
693     default:
694         abort();
695     }
696 
697     str = qemu_opt_get(opts, "pr-manager");
698     if (str) {
699         s->pr_mgr = pr_manager_lookup(str, &local_err);
700         if (local_err) {
701             error_propagate(errp, local_err);
702             ret = -EINVAL;
703             goto fail;
704         }
705     }
706 
707     s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
708     s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
709                                                false);
710 
711     s->open_flags = open_flags;
712     raw_parse_flags(bdrv_flags, &s->open_flags, false);
713 
714     s->fd = -1;
715     fd = qemu_open(filename, s->open_flags, errp);
716     ret = fd < 0 ? -errno : 0;
717 
718     if (ret < 0) {
719         if (ret == -EROFS) {
720             ret = -EACCES;
721         }
722         goto fail;
723     }
724     s->fd = fd;
725 
726     /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
727     if (s->open_flags & O_RDWR) {
728         ret = check_hdev_writable(s->fd);
729         if (ret < 0) {
730             error_setg_errno(errp, -ret, "The device is not writable");
731             goto fail;
732         }
733     }
734 
735     s->perm = 0;
736     s->shared_perm = BLK_PERM_ALL;
737 
738 #ifdef CONFIG_LINUX_AIO
739      /* Currently Linux does AIO only for files opened with O_DIRECT */
740     if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
741         error_setg(errp, "aio=native was specified, but it requires "
742                          "cache.direct=on, which was not specified.");
743         ret = -EINVAL;
744         goto fail;
745     }
746     if (s->use_linux_aio) {
747         s->has_laio_fdsync = laio_has_fdsync(s->fd);
748     }
749 #else
750     if (s->use_linux_aio) {
751         error_setg(errp, "aio=native was specified, but is not supported "
752                          "in this build.");
753         ret = -EINVAL;
754         goto fail;
755     }
756 #endif /* !defined(CONFIG_LINUX_AIO) */
757 
758     if (s->use_linux_io_uring) {
759 #ifdef CONFIG_LINUX_IO_URING
760         if (!aio_has_io_uring()) {
761             error_setg(errp, "aio=io_uring was specified, but is not "
762                              "available (disabled via io_uring_disabled "
763                              "sysctl or blocked by container runtime "
764                              "seccomp policy?)");
765             ret = -EINVAL;
766             goto fail;
767         }
768 #else
769         error_setg(errp, "aio=io_uring was specified, but is not supported "
770                          "in this build");
771         ret = -EINVAL;
772         goto fail;
773 #endif /* !defined(CONFIG_LINUX_IO_URING) */
774     }
775 
776     s->has_discard = true;
777     s->has_write_zeroes = true;
778 
779     if (fstat(s->fd, &st) < 0) {
780         ret = -errno;
781         error_setg_errno(errp, errno, "Could not stat file");
782         goto fail;
783     }
784 
785     if (!device) {
786         if (!S_ISREG(st.st_mode)) {
787             error_setg(errp, "'%s' driver requires '%s' to be a regular file",
788                        bs->drv->format_name, bs->filename);
789             ret = -EINVAL;
790             goto fail;
791         } else {
792             s->has_fallocate = true;
793         }
794     } else {
795         if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
796             error_setg(errp, "'%s' driver requires '%s' to be either "
797                        "a character or block device",
798                        bs->drv->format_name, bs->filename);
799             ret = -EINVAL;
800             goto fail;
801         }
802     }
803 #ifdef CONFIG_BLKZONED
804     /*
805      * The kernel page cache does not reliably work for writes to SWR zones
806      * of zoned block device because it can not guarantee the order of writes.
807      */
808     if ((bs->bl.zoned != BLK_Z_NONE) &&
809         (!(s->open_flags & O_DIRECT))) {
810         error_setg(errp, "The driver supports zoned devices, and it requires "
811                          "cache.direct=on, which was not specified.");
812         return -EINVAL; /* No host kernel page cache */
813     }
814 #endif
815 
816 #ifdef __FreeBSD__
817     if (S_ISCHR(st.st_mode)) {
818         /*
819          * The file is a char device (disk), which on FreeBSD isn't behind
820          * a pager, so force all requests to be aligned. This is needed
821          * so QEMU makes sure all IO operations on the device are aligned
822          * to sector size, or else FreeBSD will reject them with EINVAL.
823          */
824         s->force_alignment = true;
825     }
826 #endif
827     s->needs_alignment = raw_needs_alignment(bs);
828 
829     bs->supported_write_flags = BDRV_REQ_FUA;
830     if (s->use_linux_aio && !laio_has_fua()) {
831         bs->supported_write_flags &= ~BDRV_REQ_FUA;
832     } else if (s->use_linux_io_uring && !luring_has_fua()) {
833         bs->supported_write_flags &= ~BDRV_REQ_FUA;
834     }
835 
836     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
837     if (S_ISREG(st.st_mode)) {
838         /* When extending regular files, we get zeros from the OS */
839         bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
840     }
841     ret = 0;
842 fail:
843     if (ret < 0 && s->fd != -1) {
844         qemu_close(s->fd);
845     }
846     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
847         unlink(filename);
848     }
849     qemu_opts_del(opts);
850     return ret;
851 }
852 
853 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
854                     Error **errp)
855 {
856     BDRVRawState *s = bs->opaque;
857 
858     s->type = FTYPE_FILE;
859     return raw_open_common(bs, options, flags, 0, false, errp);
860 }
861 
/*
 * Operation selector passed to raw_handle_perm_lock().
 * NOTE(review): the names suggest the prepare/commit/abort phases of a
 * permission update; only part of the handler is visible here -- confirm.
 */
typedef enum {
    RAW_PL_PREPARE,
    RAW_PL_COMMIT,
    RAW_PL_ABORT,
} RawPermLockOp;
867 
/*
 * Iterate @i over the bit indices 0, 1, ... for as long as (1ULL << i) does
 * not exceed BLK_PERM_ALL.  @i must be a modifiable integer lvalue; it is
 * fully parenthesized in the expansion so that expressions such as *p work
 * as expected.
 */
#define PERM_FOREACH(i) \
    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; (i)++)
870 
871 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
872  * file; if @unlock == true, also unlock the unneeded bytes.
873  * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
874  */
875 static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
876                                 uint64_t perm_lock_bits,
877                                 uint64_t shared_perm_lock_bits,
878                                 bool unlock, Error **errp)
879 {
880     int ret;
881     int i;
882     uint64_t locked_perm, locked_shared_perm;
883 
884     if (s) {
885         locked_perm = s->locked_perm;
886         locked_shared_perm = s->locked_shared_perm;
887     } else {
888         /*
889          * We don't have the previous bits, just lock/unlock for each of the
890          * requested bits.
891          */
892         if (unlock) {
893             locked_perm = BLK_PERM_ALL;
894             locked_shared_perm = BLK_PERM_ALL;
895         } else {
896             locked_perm = 0;
897             locked_shared_perm = 0;
898         }
899     }
900 
901     PERM_FOREACH(i) {
902         int off = RAW_LOCK_PERM_BASE + i;
903         uint64_t bit = (1ULL << i);
904         if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
905             ret = qemu_lock_fd(fd, off, 1, false);
906             if (ret) {
907                 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
908                                           off);
909                 return ret;
910             } else if (s) {
911                 s->locked_perm |= bit;
912             }
913         } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
914             ret = qemu_unlock_fd(fd, off, 1);
915             if (ret) {
916                 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
917                 return ret;
918             } else if (s) {
919                 s->locked_perm &= ~bit;
920             }
921         }
922     }
923     PERM_FOREACH(i) {
924         int off = RAW_LOCK_SHARED_BASE + i;
925         uint64_t bit = (1ULL << i);
926         if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
927             ret = qemu_lock_fd(fd, off, 1, false);
928             if (ret) {
929                 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
930                                           off);
931                 return ret;
932             } else if (s) {
933                 s->locked_shared_perm |= bit;
934             }
935         } else if (unlock && (locked_shared_perm & bit) &&
936                    !(shared_perm_lock_bits & bit)) {
937             ret = qemu_unlock_fd(fd, off, 1);
938             if (ret) {
939                 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
940                 return ret;
941             } else if (s) {
942                 s->locked_shared_perm &= ~bit;
943             }
944         }
945     }
946     return 0;
947 }
948 
949 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
950 static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
951                                 Error **errp)
952 {
953     int ret;
954     int i;
955 
956     PERM_FOREACH(i) {
957         int off = RAW_LOCK_SHARED_BASE + i;
958         uint64_t p = 1ULL << i;
959         if (perm & p) {
960             ret = qemu_lock_fd_test(fd, off, 1, true);
961             if (ret) {
962                 char *perm_name = bdrv_perm_names(p);
963 
964                 raw_lock_error_setg_errno(errp, -ret,
965                                           "Failed to get \"%s\" lock",
966                                           perm_name);
967                 g_free(perm_name);
968                 return ret;
969             }
970         }
971     }
972     PERM_FOREACH(i) {
973         int off = RAW_LOCK_PERM_BASE + i;
974         uint64_t p = 1ULL << i;
975         if (!(shared_perm & p)) {
976             ret = qemu_lock_fd_test(fd, off, 1, true);
977             if (ret) {
978                 char *perm_name = bdrv_perm_names(p);
979 
980                 raw_lock_error_setg_errno(errp, -ret,
981                                           "Failed to get shared \"%s\" lock",
982                                           perm_name);
983                 g_free(perm_name);
984                 return ret;
985             }
986         }
987     }
988     return 0;
989 }
990 
991 static int raw_handle_perm_lock(BlockDriverState *bs,
992                                 RawPermLockOp op,
993                                 uint64_t new_perm, uint64_t new_shared,
994                                 Error **errp)
995 {
996     BDRVRawState *s = bs->opaque;
997     int ret = 0;
998     Error *local_err = NULL;
999 
1000     if (!s->use_lock) {
1001         return 0;
1002     }
1003 
1004     if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
1005         return 0;
1006     }
1007 
1008     switch (op) {
1009     case RAW_PL_PREPARE:
1010         if ((s->perm | new_perm) == s->perm &&
1011             (s->shared_perm & new_shared) == s->shared_perm)
1012         {
1013             /*
1014              * We are going to unlock bytes, it should not fail. If it fail due
1015              * to some fs-dependent permission-unrelated reasons (which occurs
1016              * sometimes on NFS and leads to abort in bdrv_replace_child) we
1017              * can't prevent such errors by any check here. And we ignore them
1018              * anyway in ABORT and COMMIT.
1019              */
1020             return 0;
1021         }
1022         ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
1023                                    ~s->shared_perm | ~new_shared,
1024                                    false, errp);
1025         if (!ret) {
1026             ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
1027             if (!ret) {
1028                 return 0;
1029             }
1030             error_append_hint(errp,
1031                               "Is another process using the image [%s]?\n",
1032                               bs->filename);
1033         }
1034         /* fall through to unlock bytes. */
1035     case RAW_PL_ABORT:
1036         raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
1037                              true, &local_err);
1038         if (local_err) {
1039             /* Theoretically the above call only unlocks bytes and it cannot
1040              * fail. Something weird happened, report it.
1041              */
1042             warn_report_err(local_err);
1043         }
1044         break;
1045     case RAW_PL_COMMIT:
1046         raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
1047                              true, &local_err);
1048         if (local_err) {
1049             /* Theoretically the above call only unlocks bytes and it cannot
1050              * fail. Something weird happened, report it.
1051              */
1052             warn_report_err(local_err);
1053         }
1054         break;
1055     }
1056     return ret;
1057 }
1058 
/*
 * Add @flag to the file status flags of @fd, keeping the flags that are
 * already set.  Returns 0 on success, or -errno if reading or writing the
 * flags with fcntl() fails.
 */
static int fcntl_setfl(int fd, int flag)
{
    int current = fcntl(fd, F_GETFL);

    if (current == -1 || fcntl(fd, F_SETFL, current | flag) == -1) {
        return -errno;
    }
    return 0;
}
1073 
1074 static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
1075                                  int *open_flags, uint64_t perm, Error **errp)
1076 {
1077     BDRVRawState *s = bs->opaque;
1078     int fd = -1;
1079     int ret;
1080     bool has_writers = perm &
1081         (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
1082     int fcntl_flags = O_APPEND | O_NONBLOCK;
1083 #ifdef O_NOATIME
1084     fcntl_flags |= O_NOATIME;
1085 #endif
1086 
1087     *open_flags = 0;
1088     if (s->type == FTYPE_CD) {
1089         *open_flags |= O_NONBLOCK;
1090     }
1091 
1092     raw_parse_flags(flags, open_flags, has_writers);
1093 
1094 #ifdef O_ASYNC
1095     /* Not all operating systems have O_ASYNC, and those that don't
1096      * will not let us track the state into rs->open_flags (typically
1097      * you achieve the same effect with an ioctl, for example I_SETSIG
1098      * on Solaris). But we do not use O_ASYNC, so that's fine.
1099      */
1100     assert((s->open_flags & O_ASYNC) == 0);
1101 #endif
1102 
1103     if (*open_flags == s->open_flags) {
1104         /* We're lucky, the existing fd is fine */
1105         return s->fd;
1106     }
1107 
1108     if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
1109         /* dup the original fd */
1110         fd = qemu_dup(s->fd);
1111         if (fd >= 0) {
1112             ret = fcntl_setfl(fd, *open_flags);
1113             if (ret) {
1114                 qemu_close(fd);
1115                 fd = -1;
1116             }
1117         }
1118     }
1119 
1120     /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
1121     if (fd == -1) {
1122         const char *normalized_filename = bs->filename;
1123         ret = raw_normalize_devicepath(&normalized_filename, errp);
1124         if (ret >= 0) {
1125             fd = qemu_open(normalized_filename, *open_flags, errp);
1126             if (fd == -1) {
1127                 return -1;
1128             }
1129         }
1130     }
1131 
1132     if (fd != -1 && (*open_flags & O_RDWR)) {
1133         ret = check_hdev_writable(fd);
1134         if (ret < 0) {
1135             qemu_close(fd);
1136             error_setg_errno(errp, -ret, "The device is not writable");
1137             return -1;
1138         }
1139     }
1140 
1141     return fd;
1142 }
1143 
1144 static int raw_reopen_prepare(BDRVReopenState *state,
1145                               BlockReopenQueue *queue, Error **errp)
1146 {
1147     BDRVRawState *s;
1148     BDRVRawReopenState *rs;
1149     QemuOpts *opts;
1150     int ret;
1151 
1152     assert(state != NULL);
1153     assert(state->bs != NULL);
1154 
1155     s = state->bs->opaque;
1156 
1157     state->opaque = g_new0(BDRVRawReopenState, 1);
1158     rs = state->opaque;
1159 
1160     /* Handle options changes */
1161     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
1162     if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
1163         ret = -EINVAL;
1164         goto out;
1165     }
1166 
1167     rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
1168     rs->check_cache_dropped =
1169         qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
1170 
1171     /* This driver's reopen function doesn't currently allow changing
1172      * other options, so let's put them back in the original QDict and
1173      * bdrv_reopen_prepare() will detect changes and complain. */
1174     qemu_opts_to_qdict(opts, state->options);
1175 
1176     /*
1177      * As part of reopen prepare we also want to create new fd by
1178      * raw_reconfigure_getfd(). But it wants updated "perm", when in
1179      * bdrv_reopen_multiple() .bdrv_reopen_prepare() callback called prior to
1180      * permission update. Happily, permission update is always a part
1181      * (a separate stage) of bdrv_reopen_multiple() so we can rely on this
1182      * fact and reconfigure fd in raw_check_perm().
1183      */
1184 
1185     s->reopen_state = state;
1186     ret = 0;
1187 
1188 out:
1189     qemu_opts_del(opts);
1190     return ret;
1191 }
1192 
1193 static void raw_reopen_commit(BDRVReopenState *state)
1194 {
1195     BDRVRawReopenState *rs = state->opaque;
1196     BDRVRawState *s = state->bs->opaque;
1197 
1198     s->drop_cache = rs->drop_cache;
1199     s->check_cache_dropped = rs->check_cache_dropped;
1200     s->open_flags = rs->open_flags;
1201     g_free(state->opaque);
1202     state->opaque = NULL;
1203 
1204     assert(s->reopen_state == state);
1205     s->reopen_state = NULL;
1206 }
1207 
1208 
1209 static void raw_reopen_abort(BDRVReopenState *state)
1210 {
1211     BDRVRawReopenState *rs = state->opaque;
1212     BDRVRawState *s = state->bs->opaque;
1213 
1214      /* nothing to do if NULL, we didn't get far enough */
1215     if (rs == NULL) {
1216         return;
1217     }
1218 
1219     g_free(state->opaque);
1220     state->opaque = NULL;
1221 
1222     assert(s->reopen_state == state);
1223     s->reopen_state = NULL;
1224 }
1225 
/*
 * Query the maximum hardware transfer size of @fd with BLKSECTGET.
 *
 * For block devices (per @st) the ioctl result is read as an unsigned short
 * and multiplied by 512; for anything else it is read as an int and returned
 * unchanged.
 *
 * Returns the size, -errno if the ioctl fails, or -ENOSYS when BLKSECTGET is
 * not available at build time.
 */
static int hdev_get_max_hw_transfer(int fd, struct stat *st)
{
#ifndef BLKSECTGET
    return -ENOSYS;
#else
    if (S_ISBLK(st->st_mode)) {
        unsigned short sectors = 0;

        if (ioctl(fd, BLKSECTGET, &sectors) != 0) {
            return -errno;
        }
        return sectors * 512;
    } else {
        int bytes = 0;

        if (ioctl(fd, BLKSECTGET, &bytes) != 0) {
            return -errno;
        }
        return bytes;
    }
#endif
}
1245 
1246 /*
1247  * Get a sysfs attribute value as character string.
1248  */
1249 #ifdef CONFIG_LINUX
1250 static int get_sysfs_str_val(struct stat *st, const char *attribute,
1251                              char **val) {
1252     g_autofree char *sysfspath = NULL;
1253     size_t len;
1254 
1255     if (!S_ISBLK(st->st_mode)) {
1256         return -ENOTSUP;
1257     }
1258 
1259     sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
1260                                 major(st->st_rdev), minor(st->st_rdev),
1261                                 attribute);
1262     if (!g_file_get_contents(sysfspath, val, &len, NULL)) {
1263         return -ENOENT;
1264     }
1265 
1266     /* The file is ended with '\n' */
1267     char *p;
1268     p = *val;
1269     if (*(p + len - 1) == '\n') {
1270         *(p + len - 1) = '\0';
1271     }
1272     return 0;
1273 }
1274 #endif
1275 
1276 #if defined(CONFIG_BLKZONED)
1277 static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
1278 {
1279     g_autofree char *val = NULL;
1280     int ret;
1281 
1282     ret = get_sysfs_str_val(st, "zoned", &val);
1283     if (ret < 0) {
1284         return ret;
1285     }
1286 
1287     if (strcmp(val, "host-managed") == 0) {
1288         *zoned = BLK_Z_HM;
1289     } else if (strcmp(val, "host-aware") == 0) {
1290         *zoned = BLK_Z_HA;
1291     } else if (strcmp(val, "none") == 0) {
1292         *zoned = BLK_Z_NONE;
1293     } else {
1294         return -ENOTSUP;
1295     }
1296     return 0;
1297 }
1298 #endif /* defined(CONFIG_BLKZONED) */
1299 
1300 #ifdef CONFIG_LINUX
1301 /*
1302  * Get a sysfs attribute value as a long integer.
1303  */
1304 static long get_sysfs_long_val(struct stat *st, const char *attribute)
1305 {
1306     g_autofree char *str = NULL;
1307     const char *end;
1308     long val;
1309     int ret;
1310 
1311     ret = get_sysfs_str_val(st, attribute, &str);
1312     if (ret < 0) {
1313         return ret;
1314     }
1315 
1316     /* The file is ended with '\n', pass 'end' to accept that. */
1317     ret = qemu_strtol(str, &end, 10, &val);
1318     if (ret == 0 && end && *end == '\0') {
1319         ret = val;
1320     }
1321     return ret;
1322 }
1323 
1324 /*
1325  * Get a sysfs attribute value as a uint32_t.
1326  */
1327 static int get_sysfs_u32_val(struct stat *st, const char *attribute,
1328                              uint32_t *u32)
1329 {
1330     g_autofree char *str = NULL;
1331     const char *end;
1332     unsigned int val;
1333     int ret;
1334 
1335     ret = get_sysfs_str_val(st, attribute, &str);
1336     if (ret < 0) {
1337         return ret;
1338     }
1339 
1340     /* The file is ended with '\n', pass 'end' to accept that. */
1341     ret = qemu_strtoui(str, &end, 10, &val);
1342     if (ret == 0 && end && *end == '\0') {
1343         *u32 = val;
1344     }
1345     return ret;
1346 }
1347 #endif
1348 
/*
 * Return the maximum number of segments for the device behind @fd.
 *
 * Character devices (per @st) are queried with the SG_GET_SG_TABLESIZE
 * ioctl; everything else is looked up in the "max_segments" sysfs attribute.
 *
 * Returns the value, or a negative errno (-ENOTSUP if the ioctl fails or
 * when not built with CONFIG_LINUX).
 */
static int hdev_get_max_segments(int fd, struct stat *st)
{
#ifndef CONFIG_LINUX
    return -ENOTSUP;
#else
    int table_size;

    if (!S_ISCHR(st->st_mode)) {
        return get_sysfs_long_val(st, "max_segments");
    }

    if (ioctl(fd, SG_GET_SG_TABLESIZE, &table_size) != 0) {
        return -ENOTSUP;
    }
    return table_size;
#endif
}
1365 
/*
 * Look up the discard alignment of the block device described by @st.
 *
 * On success *@dalign is filled in and 0 is returned; otherwise a negative
 * errno is returned (-ENOTSUP when not built with CONFIG_LINUX).
 */
static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign)
{
#ifndef CONFIG_LINUX
    return -ENOTSUP;
#else
    /*
     * What QEMU calls "discard_alignment" is exposed by Linux as
     * "discard_granularity".  The Linux attribute that is actually named
     * "discard_alignment" means something else.
     */
    return get_sysfs_u32_val(st, "discard_granularity", dalign);
#endif
}
1382 
1383 #if defined(CONFIG_BLKZONED)
1384 /*
1385  * If the reset_all flag is true, then the wps of zone whose state is
1386  * not readonly or offline should be all reset to the start sector.
1387  * Else, take the real wp of the device.
1388  */
1389 static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
1390                         unsigned int nrz, bool reset_all)
1391 {
1392     struct blk_zone *blkz;
1393     size_t rep_size;
1394     uint64_t sector = offset >> BDRV_SECTOR_BITS;
1395     BlockZoneWps *wps = bs->wps;
1396     unsigned int j = offset / bs->bl.zone_size;
1397     unsigned int n = 0, i = 0;
1398     int ret;
1399     rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
1400     g_autofree struct blk_zone_report *rep = NULL;
1401 
1402     rep = g_malloc(rep_size);
1403     blkz = (struct blk_zone *)(rep + 1);
1404     while (n < nrz) {
1405         memset(rep, 0, rep_size);
1406         rep->sector = sector;
1407         rep->nr_zones = nrz - n;
1408 
1409         do {
1410             ret = ioctl(fd, BLKREPORTZONE, rep);
1411         } while (ret != 0 && errno == EINTR);
1412         if (ret != 0) {
1413             error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
1414                     fd, offset, errno);
1415             return -errno;
1416         }
1417 
1418         if (!rep->nr_zones) {
1419             break;
1420         }
1421 
1422         for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
1423             /*
1424              * The wp tracking cares only about sequential writes required and
1425              * sequential write preferred zones so that the wp can advance to
1426              * the right location.
1427              * Use the most significant bit of the wp location to indicate the
1428              * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
1429              */
1430             if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
1431                 wps->wp[j] |= 1ULL << 63;
1432             } else {
1433                 switch(blkz[i].cond) {
1434                 case BLK_ZONE_COND_FULL:
1435                 case BLK_ZONE_COND_READONLY:
1436                     /* Zone not writable */
1437                     wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
1438                     break;
1439                 case BLK_ZONE_COND_OFFLINE:
1440                     /* Zone not writable nor readable */
1441                     wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
1442                     break;
1443                 default:
1444                     if (reset_all) {
1445                         wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
1446                     } else {
1447                         wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
1448                     }
1449                     break;
1450                 }
1451             }
1452         }
1453         sector = blkz[i - 1].start + blkz[i - 1].len;
1454     }
1455 
1456     return 0;
1457 }
1458 
1459 static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
1460                             unsigned int nrz)
1461 {
1462     if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) {
1463         error_report("update zone wp failed");
1464     }
1465 }
1466 
1467 static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
1468                                      Error **errp)
1469 {
1470     BDRVRawState *s = bs->opaque;
1471     BlockZoneModel zoned = BLK_Z_NONE;
1472     int ret;
1473 
1474     ret = get_sysfs_zoned_model(st, &zoned);
1475     if (ret < 0 || zoned == BLK_Z_NONE) {
1476         goto no_zoned;
1477     }
1478     bs->bl.zoned = zoned;
1479 
1480     ret = get_sysfs_long_val(st, "max_open_zones");
1481     if (ret >= 0) {
1482         bs->bl.max_open_zones = ret;
1483     }
1484 
1485     ret = get_sysfs_long_val(st, "max_active_zones");
1486     if (ret >= 0) {
1487         bs->bl.max_active_zones = ret;
1488     }
1489 
1490     /*
1491      * The zoned device must at least have zone size and nr_zones fields.
1492      */
1493     ret = get_sysfs_long_val(st, "chunk_sectors");
1494     if (ret < 0) {
1495         error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
1496                                      "sysfs attribute");
1497         goto no_zoned;
1498     } else if (!ret) {
1499         error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
1500         goto no_zoned;
1501     }
1502     bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
1503 
1504     ret = get_sysfs_long_val(st, "nr_zones");
1505     if (ret < 0) {
1506         error_setg_errno(errp, -ret, "Unable to read nr_zones "
1507                                      "sysfs attribute");
1508         goto no_zoned;
1509     } else if (!ret) {
1510         error_setg(errp, "Read 0 from nr_zones sysfs attribute");
1511         goto no_zoned;
1512     }
1513     bs->bl.nr_zones = ret;
1514 
1515     ret = get_sysfs_long_val(st, "zone_append_max_bytes");
1516     if (ret > 0) {
1517         bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
1518     }
1519 
1520     ret = get_sysfs_long_val(st, "physical_block_size");
1521     if (ret >= 0) {
1522         bs->bl.write_granularity = ret;
1523     }
1524 
1525     /* The refresh_limits() function can be called multiple times. */
1526     g_free(bs->wps);
1527     bs->wps = g_malloc(sizeof(BlockZoneWps) +
1528             sizeof(int64_t) * bs->bl.nr_zones);
1529     ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
1530     if (ret < 0) {
1531         error_setg_errno(errp, -ret, "report wps failed");
1532         goto no_zoned;
1533     }
1534     qemu_co_mutex_init(&bs->wps->colock);
1535     return;
1536 
1537 no_zoned:
1538     bs->bl.zoned = BLK_Z_NONE;
1539     g_free(bs->wps);
1540     bs->wps = NULL;
1541 }
1542 #else /* !defined(CONFIG_BLKZONED) */
1543 static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
1544                                      Error **errp)
1545 {
1546     bs->bl.zoned = BLK_Z_NONE;
1547 }
1548 #endif /* !defined(CONFIG_BLKZONED) */
1549 
1550 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1551 {
1552     BDRVRawState *s = bs->opaque;
1553     struct stat st;
1554 
1555     s->needs_alignment = raw_needs_alignment(bs);
1556     raw_probe_alignment(bs, s->fd, errp);
1557 
1558     bs->bl.min_mem_alignment = s->buf_align;
1559     bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size());
1560 
1561     /*
1562      * Maximum transfers are best effort, so it is okay to ignore any
1563      * errors.  That said, based on the man page errors in fstat would be
1564      * very much unexpected; the only possible case seems to be ENOMEM.
1565      */
1566     if (fstat(s->fd, &st)) {
1567         return;
1568     }
1569 
1570 #if defined(__APPLE__) && (__MACH__)
1571     struct statfs buf;
1572 
1573     if (!fstatfs(s->fd, &buf)) {
1574         bs->bl.opt_transfer = buf.f_iosize;
1575         bs->bl.pdiscard_alignment = buf.f_bsize;
1576     }
1577 #endif
1578 
1579     if (bdrv_is_sg(bs) || S_ISBLK(st.st_mode)) {
1580         int ret = hdev_get_max_hw_transfer(s->fd, &st);
1581 
1582         if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1583             bs->bl.max_hw_transfer = ret;
1584         }
1585 
1586         ret = hdev_get_max_segments(s->fd, &st);
1587         if (ret > 0) {
1588             bs->bl.max_hw_iov = ret;
1589         }
1590     }
1591 
1592     if (S_ISBLK(st.st_mode)) {
1593         uint32_t dalign = 0;
1594         int ret;
1595 
1596         ret = hdev_get_pdiscard_alignment(&st, &dalign);
1597         if (ret == 0 && dalign != 0) {
1598             uint32_t ralign = bs->bl.request_alignment;
1599 
1600             /* Probably never happens, but handle it just in case */
1601             if (dalign < ralign && (ralign % dalign == 0)) {
1602                 dalign = ralign;
1603             }
1604 
1605             /* The block layer requires a multiple of request_alignment */
1606             if (dalign % ralign != 0) {
1607                 error_setg(errp, "Invalid pdiscard_alignment limit %u is not a "
1608                         "multiple of request_alignment %u", dalign, ralign);
1609                 return;
1610             }
1611 
1612             bs->bl.pdiscard_alignment = dalign;
1613         }
1614 
1615 #ifdef __linux__
1616         /*
1617          * Linux requires logical block size alignment for write zeroes even
1618          * when normal reads/writes do not require alignment.
1619          */
1620         if (!s->needs_alignment) {
1621             ret = probe_logical_blocksize(s->fd,
1622                                           &bs->bl.pwrite_zeroes_alignment);
1623             if (ret < 0) {
1624                 error_setg_errno(errp, -ret,
1625                                  "Failed to probe logical block size");
1626                 return;
1627             }
1628         }
1629 #endif /* __linux__ */
1630     }
1631 
1632     raw_refresh_zoned_limits(bs, &st, errp);
1633 }
1634 
/*
 * Issue the BIODASDINFO2 ioctl on @fd and return its result.  Callers in
 * this file treat a negative result as "not a DASD".  Always returns -1
 * when BIODASDINFO2 is not available at build time.
 */
static int check_for_dasd(int fd)
{
#ifndef BIODASDINFO2
    return -1;
#else
    struct dasd_information2_t dasd_info = {0};

    return ioctl(fd, BIODASDINFO2, &dasd_info);
#endif
}
1645 
1646 /**
1647  * Try to get @bs's logical and physical block size.
1648  * On success, store them in @bsz and return zero.
1649  * On failure, return negative errno.
1650  */
1651 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1652 {
1653     BDRVRawState *s = bs->opaque;
1654     int ret;
1655 
1656     /* If DASD or zoned devices, get blocksizes */
1657     if (check_for_dasd(s->fd) < 0) {
1658         /* zoned devices are not DASD */
1659         if (bs->bl.zoned == BLK_Z_NONE) {
1660             return -ENOTSUP;
1661         }
1662     }
1663     ret = probe_logical_blocksize(s->fd, &bsz->log);
1664     if (ret < 0) {
1665         return ret;
1666     }
1667     return probe_physical_blocksize(s->fd, &bsz->phys);
1668 }
1669 
1670 /**
1671  * Try to get @bs's geometry: cyls, heads, sectors.
1672  * On success, store them in @geo and return 0.
1673  * On failure return -errno.
1674  * (Allows block driver to assign default geometry values that guest sees)
1675  */
1676 #ifdef __linux__
1677 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1678 {
1679     BDRVRawState *s = bs->opaque;
1680     struct hd_geometry ioctl_geo = {0};
1681 
1682     /* If DASD, get its geometry */
1683     if (check_for_dasd(s->fd) < 0) {
1684         return -ENOTSUP;
1685     }
1686     if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1687         return -errno;
1688     }
1689     /* HDIO_GETGEO may return success even though geo contains zeros
1690        (e.g. certain multipath setups) */
1691     if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1692         return -ENOTSUP;
1693     }
1694     /* Do not return a geometry for partition */
1695     if (ioctl_geo.start != 0) {
1696         return -ENOTSUP;
1697     }
1698     geo->heads = ioctl_geo.heads;
1699     geo->sectors = ioctl_geo.sectors;
1700     geo->cylinders = ioctl_geo.cylinders;
1701 
1702     return 0;
1703 }
1704 #else /* __linux__ */
1705 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1706 {
1707     return -ENOTSUP;
1708 }
1709 #endif
1710 
1711 #if defined(__linux__)
1712 static int handle_aiocb_ioctl(void *opaque)
1713 {
1714     RawPosixAIOData *aiocb = opaque;
1715     int ret;
1716 
1717     ret = RETRY_ON_EINTR(
1718         ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf)
1719     );
1720     if (ret == -1) {
1721         return -errno;
1722     }
1723 
1724     return 0;
1725 }
1726 #endif /* linux */
1727 
1728 static int handle_aiocb_flush(void *opaque)
1729 {
1730     RawPosixAIOData *aiocb = opaque;
1731     BDRVRawState *s = aiocb->bs->opaque;
1732     int ret;
1733 
1734     if (s->page_cache_inconsistent) {
1735         return -s->page_cache_inconsistent;
1736     }
1737 
1738     ret = qemu_fdatasync(aiocb->aio_fildes);
1739     if (ret == -1) {
1740         trace_file_flush_fdatasync_failed(errno);
1741 
1742         /* There is no clear definition of the semantics of a failing fsync(),
1743          * so we may have to assume the worst. The sad truth is that this
1744          * assumption is correct for Linux. Some pages are now probably marked
1745          * clean in the page cache even though they are inconsistent with the
1746          * on-disk contents. The next fdatasync() call would succeed, but no
1747          * further writeback attempt will be made. We can't get back to a state
1748          * in which we know what is on disk (we would have to rewrite
1749          * everything that was touched since the last fdatasync() at least), so
1750          * make bdrv_flush() fail permanently. Given that the behaviour isn't
1751          * really defined, I have little hope that other OSes are doing better.
1752          *
1753          * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1754          * cache. */
1755         if ((s->open_flags & O_DIRECT) == 0) {
1756             s->page_cache_inconsistent = errno;
1757         }
1758         return -errno;
1759     }
1760     return 0;
1761 }
1762 
/*
 * Thin wrappers around preadv()/pwritev().  When the build has no
 * CONFIG_PREADV they return -ENOSYS, and preadv_present starts out false
 * so that callers do not try the vectored path at all.
 */
#ifdef CONFIG_PREADV
static bool preadv_present = true;
#else
static bool preadv_present = false;
#endif

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return preadv(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return pwritev(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}
1796 
1797 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1798 {
1799     ssize_t len;
1800 
1801     len = RETRY_ON_EINTR(
1802         (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
1803             qemu_pwritev(aiocb->aio_fildes,
1804                            aiocb->io.iov,
1805                            aiocb->io.niov,
1806                            aiocb->aio_offset) :
1807             qemu_preadv(aiocb->aio_fildes,
1808                           aiocb->io.iov,
1809                           aiocb->io.niov,
1810                           aiocb->aio_offset)
1811     );
1812 
1813     if (len == -1) {
1814         return -errno;
1815     }
1816     return len;
1817 }
1818 
1819 /*
1820  * Read/writes the data to/from a given linear buffer.
1821  *
1822  * Returns the number of bytes handles or -errno in case of an error. Short
1823  * reads are only returned if the end of the file is reached.
1824  */
1825 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1826 {
1827     ssize_t offset = 0;
1828     ssize_t len;
1829 
1830     while (offset < aiocb->aio_nbytes) {
1831         if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
1832             len = pwrite(aiocb->aio_fildes,
1833                          (const char *)buf + offset,
1834                          aiocb->aio_nbytes - offset,
1835                          aiocb->aio_offset + offset);
1836         } else {
1837             len = pread(aiocb->aio_fildes,
1838                         buf + offset,
1839                         aiocb->aio_nbytes - offset,
1840                         aiocb->aio_offset + offset);
1841         }
1842         if (len == -1 && errno == EINTR) {
1843             continue;
1844         } else if (len == -1 && errno == EINVAL &&
1845                    (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1846                    !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1847                    offset > 0) {
1848             /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1849              * after a short read.  Assume that O_DIRECT short reads only occur
1850              * at EOF.  Therefore this is a short read, not an I/O error.
1851              */
1852             break;
1853         } else if (len == -1) {
1854             offset = -errno;
1855             break;
1856         } else if (len == 0) {
1857             break;
1858         }
1859         offset += len;
1860     }
1861 
1862     return offset;
1863 }
1864 
1865 static int handle_aiocb_rw(void *opaque)
1866 {
1867     RawPosixAIOData *aiocb = opaque;
1868     ssize_t nbytes;
1869     char *buf;
1870 
1871     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1872         /*
1873          * If there is just a single buffer, and it is properly aligned
1874          * we can just use plain pread/pwrite without any problems.
1875          */
1876         if (aiocb->io.niov == 1) {
1877             nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1878             goto out;
1879         }
1880         /*
1881          * We have more than one iovec, and all are properly aligned.
1882          *
1883          * Try preadv/pwritev first and fall back to linearizing the
1884          * buffer if it's not supported.
1885          */
1886         if (preadv_present) {
1887             nbytes = handle_aiocb_rw_vector(aiocb);
1888             if (nbytes == aiocb->aio_nbytes ||
1889                 (nbytes < 0 && nbytes != -ENOSYS)) {
1890                 goto out;
1891             }
1892             preadv_present = false;
1893         }
1894 
1895         /*
1896          * XXX(hch): short read/write.  no easy way to handle the reminder
1897          * using these interfaces.  For now retry using plain
1898          * pread/pwrite?
1899          */
1900     }
1901 
1902     /*
1903      * Ok, we have to do it the hard way, copy all segments into
1904      * a single aligned buffer.
1905      */
1906     buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1907     if (buf == NULL) {
1908         nbytes = -ENOMEM;
1909         goto out;
1910     }
1911 
1912     if (aiocb->aio_type & QEMU_AIO_WRITE) {
1913         char *p = buf;
1914         int i;
1915 
1916         for (i = 0; i < aiocb->io.niov; ++i) {
1917             memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1918             p += aiocb->io.iov[i].iov_len;
1919         }
1920         assert(p - buf == aiocb->aio_nbytes);
1921     }
1922 
1923     nbytes = handle_aiocb_rw_linear(aiocb, buf);
1924     if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
1925         char *p = buf;
1926         size_t count = aiocb->aio_nbytes, copy;
1927         int i;
1928 
1929         for (i = 0; i < aiocb->io.niov && count; ++i) {
1930             copy = count;
1931             if (copy > aiocb->io.iov[i].iov_len) {
1932                 copy = aiocb->io.iov[i].iov_len;
1933             }
1934             memcpy(aiocb->io.iov[i].iov_base, p, copy);
1935             assert(count >= copy);
1936             p     += copy;
1937             count -= copy;
1938         }
1939         assert(count == 0);
1940     }
1941     qemu_vfree(buf);
1942 
1943 out:
1944     if (nbytes == aiocb->aio_nbytes) {
1945         return 0;
1946     } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1947         if (aiocb->aio_type & QEMU_AIO_WRITE) {
1948             return -EINVAL;
1949         } else {
1950             iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1951                       0, aiocb->aio_nbytes - nbytes);
1952             return 0;
1953         }
1954     } else {
1955         assert(nbytes < 0);
1956         return nbytes;
1957     }
1958 }
1959 
1960 #if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
/*
 * Normalize the various "operation not available" error codes.
 *
 * @err: a negative errno value (or 0).
 *
 * Returns -ENOTSUP when @err is -ENODEV, -ENOSYS, -EOPNOTSUPP or -ENOTTY;
 * any other value is handed back unchanged.
 */
static int translate_err(int err)
{
    static const int unsupported[] = {
        -ENODEV, -ENOSYS, -EOPNOTSUPP, -ENOTTY,
    };
    size_t k;

    for (k = 0; k < sizeof(unsupported) / sizeof(unsupported[0]); k++) {
        if (unsupported[k] == err) {
            return -ENOTSUP;
        }
    }
    return err;
}
1969 #endif
1970 
1971 #ifdef CONFIG_FALLOCATE
/*
 * Call fallocate(), restarting it whenever it is interrupted (EINTR).
 *
 * Returns 0 on success; otherwise the negative errno of the failing call
 * after it has been passed through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    for (;;) {
        if (fallocate(fd, mode, offset, len) == 0) {
            return 0;
        }
        if (errno != EINTR) {
            return translate_err(-errno);
        }
    }
}
1981 #endif
1982 
1983 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1984 {
1985     int ret = -ENOTSUP;
1986     BDRVRawState *s = aiocb->bs->opaque;
1987 
1988     if (!s->has_write_zeroes) {
1989         return -ENOTSUP;
1990     }
1991 
1992 #ifdef BLKZEROOUT
1993     /* The BLKZEROOUT implementation in the kernel doesn't set
1994      * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1995      * fallbacks. */
1996     if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1997         do {
1998             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1999             if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
2000                 return 0;
2001             }
2002         } while (errno == EINTR);
2003 
2004         ret = translate_err(-errno);
2005         if (ret == -ENOTSUP) {
2006             s->has_write_zeroes = false;
2007         }
2008     }
2009 #endif
2010 
2011     return ret;
2012 }
2013 
2014 static int handle_aiocb_write_zeroes(void *opaque)
2015 {
2016     RawPosixAIOData *aiocb = opaque;
2017 #ifdef CONFIG_FALLOCATE
2018     BDRVRawState *s = aiocb->bs->opaque;
2019     int64_t len;
2020 #endif
2021 
2022     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
2023         return handle_aiocb_write_zeroes_block(aiocb);
2024     }
2025 
2026 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
2027     if (s->has_write_zeroes) {
2028         int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
2029                                aiocb->aio_offset, aiocb->aio_nbytes);
2030         if (ret == -ENOTSUP) {
2031             s->has_write_zeroes = false;
2032         } else if (ret == 0 || ret != -EINVAL) {
2033             return ret;
2034         }
2035         /*
2036          * Note: Some file systems do not like unaligned byte ranges, and
2037          * return EINVAL in such a case, though they should not do it according
2038          * to the man-page of fallocate(). Thus we simply ignore this return
2039          * value and try the other fallbacks instead.
2040          */
2041     }
2042 #endif
2043 
2044 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
2045     if (s->has_discard && s->has_fallocate) {
2046         int ret = do_fallocate(s->fd,
2047                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
2048                                aiocb->aio_offset, aiocb->aio_nbytes);
2049         if (ret == 0) {
2050             ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
2051             if (ret == 0 || ret != -ENOTSUP) {
2052                 return ret;
2053             }
2054             s->has_fallocate = false;
2055         } else if (ret == -EINVAL) {
2056             /*
2057              * Some file systems like older versions of GPFS do not like un-
2058              * aligned byte ranges, and return EINVAL in such a case, though
2059              * they should not do it according to the man-page of fallocate().
2060              * Warn about the bad filesystem and try the final fallback instead.
2061              */
2062             warn_report_once("Your file system is misbehaving: "
2063                              "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
2064                              "Please report this bug to your file system "
2065                              "vendor.");
2066         } else if (ret != -ENOTSUP) {
2067             return ret;
2068         } else {
2069             s->has_discard = false;
2070         }
2071     }
2072 #endif
2073 
2074 #ifdef CONFIG_FALLOCATE
2075     /* Last resort: we are trying to extend the file with zeroed data. This
2076      * can be done via fallocate(fd, 0) */
2077     len = raw_getlength(aiocb->bs);
2078     if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
2079         int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
2080         if (ret == 0 || ret != -ENOTSUP) {
2081             return ret;
2082         }
2083         s->has_fallocate = false;
2084     }
2085 #endif
2086 
2087     return -ENOTSUP;
2088 }
2089 
2090 static int handle_aiocb_write_zeroes_unmap(void *opaque)
2091 {
2092     RawPosixAIOData *aiocb = opaque;
2093     BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
2094 
2095     /* First try to write zeros and unmap at the same time */
2096 
2097 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
2098     int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
2099                            aiocb->aio_offset, aiocb->aio_nbytes);
2100     switch (ret) {
2101     case -ENOTSUP:
2102     case -EINVAL:
2103     case -EBUSY:
2104         break;
2105     default:
2106         return ret;
2107     }
2108 #endif
2109 
2110     /* If we couldn't manage to unmap while guaranteed that the area reads as
2111      * all-zero afterwards, just write zeroes without unmapping */
2112     return handle_aiocb_write_zeroes(aiocb);
2113 }
2114 
#ifndef HAVE_COPY_FILE_RANGE
/*
 * Fallback definition used when the build did not detect a usable
 * copy_file_range().  If the syscall number is known the raw syscall is
 * issued; otherwise the call fails with -1 and errno set to ENOSYS.
 *
 * The function is static except when EMSCRIPTEN is defined.
 * NOTE(review): presumably the Emscripten headers already declare it with
 * external linkage -- confirm.
 */
#ifndef EMSCRIPTEN
static
#endif
ssize_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                        off_t *out_off, size_t len, unsigned int flags)
{
#ifndef __NR_copy_file_range
    errno = ENOSYS;
    return -1;
#else
    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                   out_off, len, flags);
#endif
}
#endif
2131 
/*
 * parse_zone - Fill a zone descriptor
 *
 * Converts the kernel zone record @blkz into the QEMU descriptor @zone.
 * Start, length, write pointer and capacity are shifted left by
 * BDRV_SECTOR_BITS; the zone type and condition are mapped to their BLK_ZT_*
 * and BLK_ZS_* counterparts.
 *
 * Returns 0 on success, or -ENOTSUP (after reporting an error) when the zone
 * type or condition is not recognised.  Fields assigned before the failure
 * are left as written.
 */
#if defined(CONFIG_BLKZONED)
static inline int parse_zone(struct BlockZoneDescriptor *zone,
                             const struct blk_zone *blkz)
{
    zone->start = blkz->start << BDRV_SECTOR_BITS;
    zone->length = blkz->len << BDRV_SECTOR_BITS;
    zone->wp = blkz->wp << BDRV_SECTOR_BITS;

    /* Without a reported capacity the whole zone length is used instead. */
#ifndef HAVE_BLK_ZONE_REP_CAPACITY
    zone->cap = blkz->len << BDRV_SECTOR_BITS;
#else
    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
#endif

    if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ) {
        zone->type = BLK_ZT_SWR;
    } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
        zone->type = BLK_ZT_SWP;
    } else if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
        zone->type = BLK_ZT_CONV;
    } else {
        error_report("Unsupported zone type: 0x%x", blkz->type);
        return -ENOTSUP;
    }

    switch (blkz->cond) {
    case BLK_ZONE_COND_NOT_WP:
        zone->state = BLK_ZS_NOT_WP;
        return 0;
    case BLK_ZONE_COND_EMPTY:
        zone->state = BLK_ZS_EMPTY;
        return 0;
    case BLK_ZONE_COND_IMP_OPEN:
        zone->state = BLK_ZS_IOPEN;
        return 0;
    case BLK_ZONE_COND_EXP_OPEN:
        zone->state = BLK_ZS_EOPEN;
        return 0;
    case BLK_ZONE_COND_CLOSED:
        zone->state = BLK_ZS_CLOSED;
        return 0;
    case BLK_ZONE_COND_READONLY:
        zone->state = BLK_ZS_RDONLY;
        return 0;
    case BLK_ZONE_COND_FULL:
        zone->state = BLK_ZS_FULL;
        return 0;
    case BLK_ZONE_COND_OFFLINE:
        zone->state = BLK_ZS_OFFLINE;
        return 0;
    default:
        error_report("Unsupported zone state: 0x%x", blkz->cond);
        return -ENOTSUP;
    }
}
#endif
2195 
2196 #if defined(CONFIG_BLKZONED)
2197 static int handle_aiocb_zone_report(void *opaque)
2198 {
2199     RawPosixAIOData *aiocb = opaque;
2200     int fd = aiocb->aio_fildes;
2201     unsigned int *nr_zones = aiocb->zone_report.nr_zones;
2202     BlockZoneDescriptor *zones = aiocb->zone_report.zones;
2203     /* zoned block devices use 512-byte sectors */
2204     uint64_t sector = aiocb->aio_offset / 512;
2205 
2206     struct blk_zone *blkz;
2207     size_t rep_size;
2208     unsigned int nrz;
2209     int ret;
2210     unsigned int n = 0, i = 0;
2211 
2212     nrz = *nr_zones;
2213     rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
2214     g_autofree struct blk_zone_report *rep = NULL;
2215     rep = g_malloc(rep_size);
2216 
2217     blkz = (struct blk_zone *)(rep + 1);
2218     while (n < nrz) {
2219         memset(rep, 0, rep_size);
2220         rep->sector = sector;
2221         rep->nr_zones = nrz - n;
2222 
2223         do {
2224             ret = ioctl(fd, BLKREPORTZONE, rep);
2225         } while (ret != 0 && errno == EINTR);
2226         if (ret != 0) {
2227             error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
2228                          fd, sector, errno);
2229             return -errno;
2230         }
2231 
2232         if (!rep->nr_zones) {
2233             break;
2234         }
2235 
2236         for (i = 0; i < rep->nr_zones; i++, n++) {
2237             ret = parse_zone(&zones[n], &blkz[i]);
2238             if (ret != 0) {
2239                 return ret;
2240             }
2241 
2242             /* The next report should start after the last zone reported */
2243             sector = blkz[i].start + blkz[i].len;
2244         }
2245     }
2246 
2247     *nr_zones = n;
2248     return 0;
2249 }
2250 #endif
2251 
2252 #if defined(CONFIG_BLKZONED)
2253 static int handle_aiocb_zone_mgmt(void *opaque)
2254 {
2255     RawPosixAIOData *aiocb = opaque;
2256     int fd = aiocb->aio_fildes;
2257     uint64_t sector = aiocb->aio_offset / 512;
2258     int64_t nr_sectors = aiocb->aio_nbytes / 512;
2259     struct blk_zone_range range;
2260     int ret;
2261 
2262     /* Execute the operation */
2263     range.sector = sector;
2264     range.nr_sectors = nr_sectors;
2265     do {
2266         ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
2267     } while (ret != 0 && errno == EINTR);
2268 
2269     return ret < 0 ? -errno : ret;
2270 }
2271 #endif
2272 
2273 static int handle_aiocb_copy_range(void *opaque)
2274 {
2275     RawPosixAIOData *aiocb = opaque;
2276     uint64_t bytes = aiocb->aio_nbytes;
2277     off_t in_off = aiocb->aio_offset;
2278     off_t out_off = aiocb->copy_range.aio_offset2;
2279 
2280     while (bytes) {
2281         ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
2282                                       aiocb->copy_range.aio_fd2, &out_off,
2283                                       bytes, 0);
2284         trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
2285                                    aiocb->copy_range.aio_fd2, out_off, bytes,
2286                                    0, ret);
2287         if (ret == 0) {
2288             /* No progress (e.g. when beyond EOF), let the caller fall back to
2289              * buffer I/O. */
2290             return -ENOSPC;
2291         }
2292         if (ret < 0) {
2293             switch (errno) {
2294             case ENOSYS:
2295                 return -ENOTSUP;
2296             case EINTR:
2297                 continue;
2298             default:
2299                 return -errno;
2300             }
2301         }
2302         bytes -= ret;
2303     }
2304     return 0;
2305 }
2306 
2307 static int handle_aiocb_discard(void *opaque)
2308 {
2309     RawPosixAIOData *aiocb = opaque;
2310     int ret = -ENOTSUP;
2311     BDRVRawState *s = aiocb->bs->opaque;
2312 
2313     if (!s->has_discard) {
2314         return -ENOTSUP;
2315     }
2316 
2317     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
2318 #ifdef BLKDISCARD
2319         do {
2320             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
2321             if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
2322                 return 0;
2323             }
2324         } while (errno == EINTR);
2325 
2326         ret = translate_err(-errno);
2327 #endif
2328     } else {
2329 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
2330         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
2331                            aiocb->aio_offset, aiocb->aio_nbytes);
2332         ret = translate_err(ret);
2333 #elif defined(__APPLE__) && (__MACH__)
2334         fpunchhole_t fpunchhole;
2335         fpunchhole.fp_flags = 0;
2336         fpunchhole.reserved = 0;
2337         fpunchhole.fp_offset = aiocb->aio_offset;
2338         fpunchhole.fp_length = aiocb->aio_nbytes;
2339         if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
2340             ret = errno == ENODEV ? -ENOTSUP : -errno;
2341         } else {
2342             ret = 0;
2343         }
2344 #endif
2345     }
2346 
2347     if (ret == -ENOTSUP) {
2348         s->has_discard = false;
2349     }
2350     return ret;
2351 }
2352 
2353 /*
2354  * Help alignment probing by allocating the first block.
2355  *
2356  * When reading with direct I/O from unallocated area on Gluster backed by XFS,
2357  * reading succeeds regardless of request length. In this case we fallback to
2358  * safe alignment which is not optimal. Allocating the first block avoids this
2359  * fallback.
2360  *
2361  * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
2362  * request alignment, so we use safe values.
2363  *
2364  * Returns: 0 on success, -errno on failure. Since this is an optimization,
2365  * caller may ignore failures.
2366  */
2367 static int allocate_first_block(int fd, size_t max_size)
2368 {
2369     size_t write_size = (max_size < MAX_BLOCKSIZE)
2370         ? BDRV_SECTOR_SIZE
2371         : MAX_BLOCKSIZE;
2372     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
2373     void *buf;
2374     ssize_t n;
2375     int ret;
2376 
2377     buf = qemu_memalign(max_align, write_size);
2378     memset(buf, 0, write_size);
2379 
2380     n = RETRY_ON_EINTR(pwrite(fd, buf, write_size, 0));
2381 
2382     ret = (n == -1) ? -errno : 0;
2383 
2384     qemu_vfree(buf);
2385     return ret;
2386 }
2387 
/*
 * Thread-pool worker: resize the file aio_fildes to aio_offset bytes and,
 * depending on truncate.prealloc, preallocate the newly added area.
 *
 * @opaque is a RawPosixAIOData; errors are described through truncate.errp.
 *
 * Preallocation cannot be combined with shrinking.  When a preallocating
 * mode fails, an attempt is made to restore the previous file length.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int handle_aiocb_truncate(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    int result = 0;
    int64_t current_length = 0;
    char *buf = NULL;
    struct stat st;
    int fd = aiocb->aio_fildes;
    int64_t offset = aiocb->aio_offset;
    PreallocMode prealloc = aiocb->truncate.prealloc;
    Error **errp = aiocb->truncate.errp;

    if (fstat(fd, &st) < 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not stat file");
        return result;
    }

    current_length = st.st_size;
    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Cannot use preallocation for shrinking files");
        return -ENOTSUP;
    }

    switch (prealloc) {
#ifdef CONFIG_POSIX_FALLOCATE
    case PREALLOC_MODE_FALLOC:
        /*
         * Truncating before posix_fallocate() makes it about twice slower on
         * file systems that do not support fallocate(), trying to check if a
         * block is allocated before allocating it, so don't do that here.
         */
        if (offset != current_length) {
            /* Only the area between the old and the new end is allocated. */
            result = -posix_fallocate(fd, current_length,
                                      offset - current_length);
            if (result != 0) {
                /* posix_fallocate() doesn't set errno. */
                error_setg_errno(errp, -result,
                                 "Could not preallocate new data");
            } else if (current_length == 0) {
                /*
                 * posix_fallocate() uses fallocate() if the filesystem
                 * supports it, or fallback to manually writing zeroes. If
                 * fallocate() was used, unaligned reads from the fallocated
                 * area in raw_probe_alignment() will succeed, hence we need to
                 * allocate the first block.
                 *
                 * Optimize future alignment probing; ignore failures.
                 */
                allocate_first_block(fd, offset);
            }
        } else {
            result = 0;
        }
        goto out;
#endif
    case PREALLOC_MODE_FULL:
    {
        /* 'left' counts the bytes that still have to be written as zeroes. */
        int64_t num = 0, left = offset - current_length;
        off_t seek_result;

        /*
         * Knowing the final size from the beginning could allow the file
         * system driver to do less allocations and possibly avoid
         * fragmentation of the file.
         */
        if (ftruncate(fd, offset) != 0) {
            result = -errno;
            error_setg_errno(errp, -result, "Could not resize file");
            goto out;
        }

        /* Zero-filled 64 KiB scratch buffer; released at the 'out' label. */
        buf = g_malloc0(65536);

        seek_result = lseek(fd, current_length, SEEK_SET);
        if (seek_result < 0) {
            result = -errno;
            error_setg_errno(errp, -result,
                             "Failed to seek to the old end of file");
            goto out;
        }

        /* Write zeroes from the old end of file up to the new size. */
        while (left > 0) {
            num = MIN(left, 65536);
            result = write(fd, buf, num);
            if (result < 0) {
                if (errno == EINTR) {
                    /* Interrupted before anything was written: retry. */
                    continue;
                }
                result = -errno;
                error_setg_errno(errp, -result,
                                 "Could not write zeros for preallocation");
                goto out;
            }
            /* A short write simply leaves more for the next iteration. */
            left -= result;
        }
        if (result >= 0) {
            result = fsync(fd);
            if (result < 0) {
                result = -errno;
                error_setg_errno(errp, -result,
                                 "Could not flush file to disk");
                goto out;
            }
        }
        goto out;
    }
    case PREALLOC_MODE_OFF:
        if (ftruncate(fd, offset) != 0) {
            result = -errno;
            error_setg_errno(errp, -result, "Could not resize file");
        } else if (current_length == 0 && offset > current_length) {
            /* Optimize future alignment probing; ignore failures. */
            allocate_first_block(fd, offset);
        }
        /* No rollback in this mode: return without passing through 'out'. */
        return result;
    default:
        result = -ENOTSUP;
        error_setg(errp, "Unsupported preallocation mode: %s",
                   PreallocMode_str(prealloc));
        return result;
    }

out:
    /* A failed preallocation must not leave the file at the new size. */
    if (result < 0) {
        if (ftruncate(fd, current_length) < 0) {
            error_report("Failed to restore old file length: %s",
                         strerror(errno));
        }
    }

    g_free(buf);
    return result;
}
2522 
2523 static int coroutine_fn raw_thread_pool_submit(ThreadPoolFunc func, void *arg)
2524 {
2525     return thread_pool_submit_co(func, arg);
2526 }
2527 
2528 /*
2529  * Check if all memory in this vector is sector aligned.
2530  */
2531 static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2532 {
2533     int i;
2534     size_t alignment = bdrv_min_mem_align(bs);
2535     size_t len = bs->bl.request_alignment;
2536     IO_CODE();
2537 
2538     for (i = 0; i < qiov->niov; i++) {
2539         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2540             return false;
2541         }
2542         if (qiov->iov[i].iov_len % len) {
2543             return false;
2544         }
2545     }
2546 
2547     return true;
2548 }
2549 
2550 #ifdef CONFIG_LINUX_AIO
2551 static inline bool raw_check_linux_aio(BDRVRawState *s)
2552 {
2553     Error *local_err = NULL;
2554     AioContext *ctx;
2555 
2556     if (!s->use_linux_aio) {
2557         return false;
2558     }
2559 
2560     ctx = qemu_get_current_aio_context();
2561     if (unlikely(!aio_setup_linux_aio(ctx, &local_err))) {
2562         error_reportf_err(local_err, "Unable to use Linux AIO, "
2563                                      "falling back to thread pool: ");
2564         s->use_linux_aio = false;
2565         return false;
2566     }
2567     return true;
2568 }
2569 #endif
2570 
/*
 * Common read/write path for raw_co_preadv() and raw_co_pwritev().
 *
 * @offset_ptr: in: byte offset of the request; out: for a successful
 *              QEMU_AIO_ZONE_APPEND on a non-conventional zone, the zone's
 *              write pointer value before it was advanced.
 * @bytes:      request length; must equal qiov->size.
 * @type:       QEMU_AIO_* request type bits.
 * @flags:      request flags; BDRV_REQ_FUA triggers a flush after a
 *              successful thread-pool request.
 *
 * The request goes to io_uring or Linux AIO when those are enabled and the
 * vector is suitably aligned, otherwise to the thread pool.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int coroutine_fn GRAPH_RDLOCK
raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes,
           QEMUIOVector *qiov, int type, int flags)
{
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb;
    int ret;
    uint64_t offset = *offset_ptr;

    if (fd_open(bs) < 0)
        return -EIO;
#if defined(CONFIG_BLKZONED)
    /*
     * Writes to a zoned device hold the write pointer lock for the whole
     * request; it is released at the end of this function.
     */
    if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
        bs->bl.zoned != BLK_Z_NONE) {
        qemu_co_mutex_lock(&bs->wps->colock);
        if (type & QEMU_AIO_ZONE_APPEND) {
            /* An append lands at the zone's current write pointer. */
            int index = offset / bs->bl.zone_size;
            offset = bs->wps->wp[index];
        }
    }
#endif

    /*
     * When using O_DIRECT, the request must be aligned to be able to use
     * either libaio or io_uring interface. If not fall back to regular thread
     * pool read/write code which emulates this for us if we
     * set QEMU_AIO_MISALIGNED.
     */
    if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
        type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_IO_URING
    } else if (s->use_linux_io_uring) {
        assert(qiov->size == bytes);
        ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
        goto out;
#endif
#ifdef CONFIG_LINUX_AIO
    } else if (raw_check_linux_aio(s)) {
        assert(qiov->size == bytes);
        ret = laio_co_submit(s->fd, offset, qiov, type, flags,
                              s->aio_max_batch);
        goto out;
#endif
    }

    /* Thread-pool path: describe the request for handle_aiocb_rw(). */
    acb = (RawPosixAIOData) {
        .bs             = bs,
        .aio_fildes     = s->fd,
        .aio_type       = type,
        .aio_offset     = offset,
        .aio_nbytes     = bytes,
        .io             = {
            .iov            = qiov->iov,
            .niov           = qiov->niov,
        },
    };

    assert(qiov->size == bytes);
    ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        /* TODO Use pwritev2() instead if it's available */
        ret = bdrv_co_flush(bs);
    }
    goto out; /* Avoid the compiler err of unused label */

out:
#if defined(CONFIG_BLKZONED)
    if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
        bs->bl.zoned != BLK_Z_NONE) {
        BlockZoneWps *wps = bs->wps;
        if (ret == 0) {
            uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
            if (!BDRV_ZT_IS_CONV(*wp)) {
                if (type & QEMU_AIO_ZONE_APPEND) {
                    /* Report the write pointer value to the caller. */
                    *offset_ptr = *wp;
                    trace_zbd_zone_append_complete(bs, *offset_ptr
                        >> BDRV_SECTOR_BITS);
                }
                /* Advance the wp if needed */
                if (offset + bytes > *wp) {
                    *wp = offset + bytes;
                }
            }
        } else {
            /*
             * write and append write are not allowed to cross zone boundaries
             */
            update_zones_wp(bs, s->fd, offset, 1);
        }

        qemu_co_mutex_unlock(&wps->colock);
    }
#endif
    return ret;
}
2666 
2667 static int coroutine_fn GRAPH_RDLOCK
2668 raw_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
2669               QEMUIOVector *qiov, BdrvRequestFlags flags)
2670 {
2671     return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ, flags);
2672 }
2673 
2674 static int coroutine_fn GRAPH_RDLOCK
2675 raw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
2676                QEMUIOVector *qiov, BdrvRequestFlags flags)
2677 {
2678     return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE, flags);
2679 }
2680 
2681 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
2682 {
2683     BDRVRawState *s = bs->opaque;
2684     RawPosixAIOData acb;
2685     int ret;
2686 
2687     ret = fd_open(bs);
2688     if (ret < 0) {
2689         return ret;
2690     }
2691 
2692     acb = (RawPosixAIOData) {
2693         .bs             = bs,
2694         .aio_fildes     = s->fd,
2695         .aio_type       = QEMU_AIO_FLUSH,
2696     };
2697 
2698 #ifdef CONFIG_LINUX_IO_URING
2699     if (s->use_linux_io_uring) {
2700         return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
2701     }
2702 #endif
2703 #ifdef CONFIG_LINUX_AIO
2704     if (s->has_laio_fdsync && raw_check_linux_aio(s)) {
2705         return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0, 0);
2706     }
2707 #endif
2708     return raw_thread_pool_submit(handle_aiocb_flush, &acb);
2709 }
2710 
2711 static void raw_close(BlockDriverState *bs)
2712 {
2713     BDRVRawState *s = bs->opaque;
2714 
2715     if (s->fd >= 0) {
2716 #if defined(CONFIG_BLKZONED)
2717         g_free(bs->wps);
2718 #endif
2719         qemu_close(s->fd);
2720         s->fd = -1;
2721     }
2722 }
2723 
2724 /**
2725  * Truncates the given regular file @fd to @offset and, when growing, fills the
2726  * new space according to @prealloc.
2727  *
2728  * Returns: 0 on success, -errno on failure.
2729  */
2730 static int coroutine_fn
2731 raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2732                      PreallocMode prealloc, Error **errp)
2733 {
2734     RawPosixAIOData acb;
2735 
2736     acb = (RawPosixAIOData) {
2737         .bs             = bs,
2738         .aio_fildes     = fd,
2739         .aio_type       = QEMU_AIO_TRUNCATE,
2740         .aio_offset     = offset,
2741         .truncate       = {
2742             .prealloc       = prealloc,
2743             .errp           = errp,
2744         },
2745     };
2746 
2747     return raw_thread_pool_submit(handle_aiocb_truncate, &acb);
2748 }
2749 
2750 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2751                                         bool exact, PreallocMode prealloc,
2752                                         BdrvRequestFlags flags, Error **errp)
2753 {
2754     BDRVRawState *s = bs->opaque;
2755     struct stat st;
2756     int ret;
2757 
2758     if (fstat(s->fd, &st)) {
2759         ret = -errno;
2760         error_setg_errno(errp, -ret, "Failed to fstat() the file");
2761         return ret;
2762     }
2763 
2764     if (S_ISREG(st.st_mode)) {
2765         /* Always resizes to the exact @offset */
2766         return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2767     }
2768 
2769     if (prealloc != PREALLOC_MODE_OFF) {
2770         error_setg(errp, "Preallocation mode '%s' unsupported for this "
2771                    "non-regular file", PreallocMode_str(prealloc));
2772         return -ENOTSUP;
2773     }
2774 
2775     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2776         int64_t cur_length = raw_getlength(bs);
2777 
2778         if (offset != cur_length && exact) {
2779             error_setg(errp, "Cannot resize device files");
2780             return -ENOTSUP;
2781         } else if (offset > cur_length) {
2782             error_setg(errp, "Cannot grow device files");
2783             return -EINVAL;
2784         }
2785     } else {
2786         error_setg(errp, "Resizing this file is not supported");
2787         return -ENOTSUP;
2788     }
2789 
2790     return 0;
2791 }
2792 
2793 #ifdef __OpenBSD__
2794 static int64_t raw_getlength(BlockDriverState *bs)
2795 {
2796     BDRVRawState *s = bs->opaque;
2797     int fd = s->fd;
2798     struct stat st;
2799 
2800     if (fstat(fd, &st))
2801         return -errno;
2802     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2803         struct disklabel dl;
2804 
2805         if (ioctl(fd, DIOCGDINFO, &dl))
2806             return -errno;
2807         return (uint64_t)dl.d_secsize *
2808             dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2809     } else
2810         return st.st_size;
2811 }
2812 #elif defined(__NetBSD__)
2813 static int64_t raw_getlength(BlockDriverState *bs)
2814 {
2815     BDRVRawState *s = bs->opaque;
2816     int fd = s->fd;
2817     struct stat st;
2818 
2819     if (fstat(fd, &st))
2820         return -errno;
2821     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2822         struct dkwedge_info dkw;
2823 
2824         if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2825             return dkw.dkw_size * 512;
2826         } else {
2827             struct disklabel dl;
2828 
2829             if (ioctl(fd, DIOCGDINFO, &dl))
2830                 return -errno;
2831             return (uint64_t)dl.d_secsize *
2832                 dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2833         }
2834     } else
2835         return st.st_size;
2836 }
2837 #elif defined(__sun__)
2838 static int64_t raw_getlength(BlockDriverState *bs)
2839 {
2840     BDRVRawState *s = bs->opaque;
2841     struct dk_minfo minfo;
2842     int ret;
2843     int64_t size;
2844 
2845     ret = fd_open(bs);
2846     if (ret < 0) {
2847         return ret;
2848     }
2849 
2850     /*
2851      * Use the DKIOCGMEDIAINFO ioctl to read the size.
2852      */
2853     ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2854     if (ret != -1) {
2855         return minfo.dki_lbsize * minfo.dki_capacity;
2856     }
2857 
2858     /*
2859      * There are reports that lseek on some devices fails, but
2860      * irc discussion said that contingency on contingency was overkill.
2861      */
2862     size = lseek(s->fd, 0, SEEK_END);
2863     if (size < 0) {
2864         return -errno;
2865     }
2866     return size;
2867 }
2868 #elif defined(CONFIG_BSD)
/*
 * Generic BSD variant: return the length of @bs in bytes, or a negative
 * errno on failure.
 *
 * For device nodes the size is obtained from whichever of the
 * DIOCGMEDIASIZE, DIOCGPART and DKIOCGETBLOCKCOUNT/DKIOCGETBLOCKSIZE ioctls
 * is available at build time, trying the next one while the size is still
 * 0, and finally from lseek(SEEK_END).  Other files use lseek(SEEK_END)
 * directly.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    /* Guards against reopening the CD-ROM more than once. */
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    /*
     * NOTE(review): this is a bit test against S_IFCHR rather than
     * S_ISCHR(), so any file type whose mode shares that bit takes this
     * branch too -- confirm whether that is intended.
     */
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
        size = 0;
#ifdef DIOCGMEDIASIZE
        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
            size = 0;
        }
#endif
#ifdef DIOCGPART
        if (size == 0) {
            struct partinfo pi;
            if (ioctl(fd, DIOCGPART, &pi) == 0) {
                size = pi.media_size;
            }
        }
#endif
#if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
        if (size == 0) {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            }
        }
#endif
        /* None of the ioctls produced a size: fall back to seeking. */
        if (size == 0) {
            size = lseek(fd, 0LL, SEEK_END);
        }
        if (size < 0) {
            return -errno;
        }
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
2940 #else
2941 static int64_t raw_getlength(BlockDriverState *bs)
2942 {
2943     BDRVRawState *s = bs->opaque;
2944     int ret;
2945     int64_t size;
2946 
2947     ret = fd_open(bs);
2948     if (ret < 0) {
2949         return ret;
2950     }
2951 
2952     size = lseek(s->fd, 0, SEEK_END);
2953     if (size < 0) {
2954         return -errno;
2955     }
2956     return size;
2957 }
2958 #endif
2959 
2960 static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
2961 {
2962     return raw_getlength(bs);
2963 }
2964 
2965 static int64_t coroutine_fn raw_co_get_allocated_file_size(BlockDriverState *bs)
2966 {
2967     struct stat st;
2968     BDRVRawState *s = bs->opaque;
2969 
2970     if (fstat(s->fd, &st) < 0) {
2971         return -errno;
2972     }
2973     return (int64_t)st.st_blocks * 512;
2974 }
2975 
2976 static int coroutine_fn
2977 raw_co_create(BlockdevCreateOptions *options, Error **errp)
2978 {
2979     BlockdevCreateOptionsFile *file_opts;
2980     Error *local_err = NULL;
2981     int fd;
2982     uint64_t perm, shared;
2983     int result = 0;
2984 
2985     /* Validate options and set default values */
2986     assert(options->driver == BLOCKDEV_DRIVER_FILE);
2987     file_opts = &options->u.file;
2988 
2989     if (!file_opts->has_nocow) {
2990         file_opts->nocow = false;
2991     }
2992     if (!file_opts->has_preallocation) {
2993         file_opts->preallocation = PREALLOC_MODE_OFF;
2994     }
2995     if (!file_opts->has_extent_size_hint) {
2996         file_opts->extent_size_hint = 1 * MiB;
2997     }
2998     if (file_opts->extent_size_hint > UINT32_MAX) {
2999         result = -EINVAL;
3000         error_setg(errp, "Extent size hint is too large");
3001         goto out;
3002     }
3003 
3004     /* Create file */
3005     fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
3006     if (fd < 0) {
3007         result = -errno;
3008         goto out;
3009     }
3010 
3011     /* Take permissions: We want to discard everything, so we need
3012      * BLK_PERM_WRITE; and truncation to the desired size requires
3013      * BLK_PERM_RESIZE.
3014      * On the other hand, we cannot share the RESIZE permission
3015      * because we promise that after this function, the file has the
3016      * size given in the options.  If someone else were to resize it
3017      * concurrently, we could not guarantee that.
3018      * Note that after this function, we can no longer guarantee that
3019      * the file is not touched by a third party, so it may be resized
3020      * then. */
3021     perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
3022     shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
3023 
3024     /* Step one: Take locks */
3025     result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
3026     if (result < 0) {
3027         goto out_close;
3028     }
3029 
3030     /* Step two: Check that nobody else has taken conflicting locks */
3031     result = raw_check_lock_bytes(fd, perm, shared, errp);
3032     if (result < 0) {
3033         error_append_hint(errp,
3034                           "Is another process using the image [%s]?\n",
3035                           file_opts->filename);
3036         goto out_unlock;
3037     }
3038 
3039     /* Clear the file by truncating it to 0 */
3040     result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
3041     if (result < 0) {
3042         goto out_unlock;
3043     }
3044 
3045     if (file_opts->nocow) {
3046 #ifdef __linux__
3047         /* Set NOCOW flag to solve performance issue on fs like btrfs.
3048          * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
3049          * will be ignored since any failure of this operation should not
3050          * block the left work.
3051          */
3052         int attr;
3053         if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
3054             attr |= FS_NOCOW_FL;
3055             ioctl(fd, FS_IOC_SETFLAGS, &attr);
3056         }
3057 #endif
3058     }
3059 #ifdef FS_IOC_FSSETXATTR
3060     /*
3061      * Try to set the extent size hint. Failure is not fatal, and a warning is
3062      * only printed if the option was explicitly specified.
3063      */
3064     {
3065         struct fsxattr attr;
3066         result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
3067         if (result == 0) {
3068             attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
3069             attr.fsx_extsize = file_opts->extent_size_hint;
3070             result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
3071         }
3072         if (result < 0 && file_opts->has_extent_size_hint &&
3073             file_opts->extent_size_hint)
3074         {
3075             warn_report("Failed to set extent size hint: %s",
3076                         strerror(errno));
3077         }
3078     }
3079 #endif
3080 
3081     /* Resize and potentially preallocate the file to the desired
3082      * final size */
3083     result = raw_regular_truncate(NULL, fd, file_opts->size,
3084                                   file_opts->preallocation, errp);
3085     if (result < 0) {
3086         goto out_unlock;
3087     }
3088 
3089 out_unlock:
3090     raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
3091     if (local_err) {
3092         /* The above call should not fail, and if it does, that does
3093          * not mean the whole creation operation has failed.  So
3094          * report it the user for their convenience, but do not report
3095          * it to the caller. */
3096         warn_report_err(local_err);
3097     }
3098 
3099 out_close:
3100     if (qemu_close(fd) != 0 && result == 0) {
3101         result = -errno;
3102         error_setg_errno(errp, -result, "Could not close the new file");
3103     }
3104 out:
3105     return result;
3106 }
3107 
3108 static int coroutine_fn GRAPH_RDLOCK
3109 raw_co_create_opts(BlockDriver *drv, const char *filename,
3110                    QemuOpts *opts, Error **errp)
3111 {
3112     BlockdevCreateOptions options;
3113     int64_t total_size = 0;
3114     int64_t extent_size_hint = 0;
3115     bool has_extent_size_hint = false;
3116     bool nocow = false;
3117     PreallocMode prealloc;
3118     char *buf = NULL;
3119     Error *local_err = NULL;
3120 
3121     /* Skip file: protocol prefix */
3122     strstart(filename, "file:", &filename);
3123 
3124     /* Read out options */
3125     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
3126                           BDRV_SECTOR_SIZE);
3127     if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
3128         has_extent_size_hint = true;
3129         extent_size_hint =
3130             qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
3131     }
3132     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
3133     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
3134     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
3135                                PREALLOC_MODE_OFF, &local_err);
3136     g_free(buf);
3137     if (local_err) {
3138         error_propagate(errp, local_err);
3139         return -EINVAL;
3140     }
3141 
3142     options = (BlockdevCreateOptions) {
3143         .driver     = BLOCKDEV_DRIVER_FILE,
3144         .u.file     = {
3145             .filename           = (char *) filename,
3146             .size               = total_size,
3147             .has_preallocation  = true,
3148             .preallocation      = prealloc,
3149             .has_nocow          = true,
3150             .nocow              = nocow,
3151             .has_extent_size_hint = has_extent_size_hint,
3152             .extent_size_hint   = extent_size_hint,
3153         },
3154     };
3155     return raw_co_create(&options, errp);
3156 }
3157 
3158 static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
3159                                            Error **errp)
3160 {
3161     struct stat st;
3162     int ret;
3163 
3164     if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) {
3165         error_setg_errno(errp, ENOENT, "%s is not a regular file",
3166                          bs->filename);
3167         return -ENOENT;
3168     }
3169 
3170     ret = unlink(bs->filename);
3171     if (ret < 0) {
3172         ret = -errno;
3173         error_setg_errno(errp, -ret, "Error when deleting file %s",
3174                          bs->filename);
3175     }
3176 
3177     return ret;
3178 }
3179 
3180 /*
3181  * Find allocation range in @bs around offset @start.
3182  * May change underlying file descriptor's file offset.
3183  * If @start is not in a hole, store @start in @data, and the
3184  * beginning of the next hole in @hole, and return 0.
3185  * If @start is in a non-trailing hole, store @start in @hole and the
3186  * beginning of the next non-hole in @data, and return 0.
3187  * If @start is in a trailing hole or beyond EOF, return -ENXIO.
3188  * If we can't find out, return a negative errno other than -ENXIO.
3189  */
3190 static int find_allocation(BlockDriverState *bs, off_t start,
3191                            off_t *data, off_t *hole)
3192 {
3193 #if defined SEEK_HOLE && defined SEEK_DATA
3194     BDRVRawState *s = bs->opaque;
3195     off_t offs;
3196 
3197     /*
3198      * SEEK_DATA cases:
3199      * D1. offs == start: start is in data
3200      * D2. offs > start: start is in a hole, next data at offs
3201      * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
3202      *                              or start is beyond EOF
3203      *     If the latter happens, the file has been truncated behind
3204      *     our back since we opened it.  All bets are off then.
3205      *     Treating like a trailing hole is simplest.
3206      * D4. offs < 0, errno != ENXIO: we learned nothing
3207      */
3208     offs = lseek(s->fd, start, SEEK_DATA);
3209     if (offs < 0) {
3210         return -errno;          /* D3 or D4 */
3211     }
3212 
3213     if (offs < start) {
3214         /* This is not a valid return by lseek().  We are safe to just return
3215          * -EIO in this case, and we'll treat it like D4. */
3216         return -EIO;
3217     }
3218 
3219     if (offs > start) {
3220         /* D2: in hole, next data at offs */
3221         *hole = start;
3222         *data = offs;
3223         return 0;
3224     }
3225 
3226     /* D1: in data, end not yet known */
3227 
3228     /*
3229      * SEEK_HOLE cases:
3230      * H1. offs == start: start is in a hole
3231      *     If this happens here, a hole has been dug behind our back
3232      *     since the previous lseek().
3233      * H2. offs > start: either start is in data, next hole at offs,
3234      *                   or start is in trailing hole, EOF at offs
3235      *     Linux treats trailing holes like any other hole: offs ==
3236      *     start.  Solaris seeks to EOF instead: offs > start (blech).
3237      *     If that happens here, a hole has been dug behind our back
3238      *     since the previous lseek().
3239      * H3. offs < 0, errno = ENXIO: start is beyond EOF
3240      *     If this happens, the file has been truncated behind our
3241      *     back since we opened it.  Treat it like a trailing hole.
3242      * H4. offs < 0, errno != ENXIO: we learned nothing
3243      *     Pretend we know nothing at all, i.e. "forget" about D1.
3244      */
3245     offs = lseek(s->fd, start, SEEK_HOLE);
3246     if (offs < 0) {
3247         return -errno;          /* D1 and (H3 or H4) */
3248     }
3249 
3250     if (offs < start) {
3251         /* This is not a valid return by lseek().  We are safe to just return
3252          * -EIO in this case, and we'll treat it like H4. */
3253         return -EIO;
3254     }
3255 
3256     if (offs > start) {
3257         /*
3258          * D1 and H2: either in data, next hole at offs, or it was in
3259          * data but is now in a trailing hole.  In the latter case,
3260          * all bets are off.  Treating it as if it there was data all
3261          * the way to EOF is safe, so simply do that.
3262          */
3263         *data = start;
3264         *hole = offs;
3265         return 0;
3266     }
3267 
3268     /* D1 and H1 */
3269     return -EBUSY;
3270 #else
3271     return -ENOTSUP;
3272 #endif
3273 }
3274 
3275 /*
3276  * Returns the allocation status of the specified offset.
3277  *
3278  * The block layer guarantees 'offset' and 'bytes' are within bounds.
3279  *
3280  * 'pnum' is set to the number of bytes (including and immediately following
3281  * the specified offset) that are known to be in the same
3282  * allocated/unallocated state.
3283  *
3284  * 'bytes' is a soft cap for 'pnum'.  If the information is free, 'pnum' may
3285  * well exceed it.
3286  */
3287 static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
3288                                             unsigned int mode,
3289                                             int64_t offset,
3290                                             int64_t bytes, int64_t *pnum,
3291                                             int64_t *map,
3292                                             BlockDriverState **file)
3293 {
3294     off_t data = 0, hole = 0;
3295     int ret;
3296 
3297     assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
3298 
3299     ret = fd_open(bs);
3300     if (ret < 0) {
3301         return ret;
3302     }
3303 
3304     if (!(mode & BDRV_WANT_ZERO)) {
3305         /* There is no backing file - all bytes are allocated in this file.  */
3306         *pnum = bytes;
3307         *map = offset;
3308         *file = bs;
3309         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
3310     }
3311 
3312     ret = find_allocation(bs, offset, &data, &hole);
3313     if (ret == -ENXIO) {
3314         /* Trailing hole */
3315         *pnum = bytes;
3316         ret = BDRV_BLOCK_ZERO;
3317     } else if (ret < 0) {
3318         /* No info available, so pretend there are no holes */
3319         *pnum = bytes;
3320         ret = BDRV_BLOCK_DATA;
3321     } else if (data == offset) {
3322         /* On a data extent, compute bytes to the end of the extent,
3323          * possibly including a partial sector at EOF. */
3324         *pnum = hole - offset;
3325 
3326         /*
3327          * We are not allowed to return partial sectors, though, so
3328          * round up if necessary.
3329          */
3330         if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
3331             int64_t file_length = raw_getlength(bs);
3332             if (file_length > 0) {
3333                 /* Ignore errors, this is just a safeguard */
3334                 assert(hole == file_length);
3335             }
3336             *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
3337         }
3338 
3339         ret = BDRV_BLOCK_DATA;
3340     } else {
3341         /* On a hole, compute bytes to the beginning of the next extent.  */
3342         assert(hole == offset);
3343         *pnum = data - offset;
3344         ret = BDRV_BLOCK_ZERO;
3345     }
3346     *map = offset;
3347     *file = bs;
3348     return ret | BDRV_BLOCK_OFFSET_VALID;
3349 }
3350 
3351 #if defined(__linux__)
3352 /* Verify that the file is not in the page cache */
3353 static void check_cache_dropped(BlockDriverState *bs, Error **errp)
3354 {
3355     const size_t window_size = 128 * 1024 * 1024;
3356     BDRVRawState *s = bs->opaque;
3357     void *window = NULL;
3358     size_t length = 0;
3359     unsigned char *vec;
3360     size_t page_size;
3361     off_t offset;
3362     off_t end;
3363 
3364     /* mincore(2) page status information requires 1 byte per page */
3365     page_size = sysconf(_SC_PAGESIZE);
3366     vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
3367 
3368     end = raw_getlength(bs);
3369 
3370     for (offset = 0; offset < end; offset += window_size) {
3371         void *new_window;
3372         size_t new_length;
3373         size_t vec_end;
3374         size_t i;
3375         int ret;
3376 
3377         /* Unmap previous window if size has changed */
3378         new_length = MIN(end - offset, window_size);
3379         if (new_length != length) {
3380             munmap(window, length);
3381             window = NULL;
3382             length = 0;
3383         }
3384 
3385         new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
3386                           s->fd, offset);
3387         if (new_window == MAP_FAILED) {
3388             error_setg_errno(errp, errno, "mmap failed");
3389             break;
3390         }
3391 
3392         window = new_window;
3393         length = new_length;
3394 
3395         ret = mincore(window, length, vec);
3396         if (ret < 0) {
3397             error_setg_errno(errp, errno, "mincore failed");
3398             break;
3399         }
3400 
3401         vec_end = DIV_ROUND_UP(length, page_size);
3402         for (i = 0; i < vec_end; i++) {
3403             if (vec[i] & 0x1) {
3404                 break;
3405             }
3406         }
3407         if (i < vec_end) {
3408             error_setg(errp, "page cache still in use!");
3409             break;
3410         }
3411     }
3412 
3413     if (window) {
3414         munmap(window, length);
3415     }
3416 
3417     g_free(vec);
3418 }
3419 #endif /* __linux__ */
3420 
3421 static void coroutine_fn GRAPH_RDLOCK
3422 raw_co_invalidate_cache(BlockDriverState *bs, Error **errp)
3423 {
3424     BDRVRawState *s = bs->opaque;
3425     int ret;
3426 
3427     ret = fd_open(bs);
3428     if (ret < 0) {
3429         error_setg_errno(errp, -ret, "The file descriptor is not open");
3430         return;
3431     }
3432 
3433     if (!s->drop_cache) {
3434         return;
3435     }
3436 
3437     if (s->open_flags & O_DIRECT) {
3438         return; /* No host kernel page cache */
3439     }
3440 
3441 #if defined(__linux__)
3442     /* This sets the scene for the next syscall... */
3443     ret = bdrv_co_flush(bs);
3444     if (ret < 0) {
3445         error_setg_errno(errp, -ret, "flush failed");
3446         return;
3447     }
3448 
3449     /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
3450      * process.  These limitations are okay because we just fsynced the file,
3451      * we don't use mmap, and the file should not be in use by other processes.
3452      */
3453     ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
3454     if (ret != 0) { /* the return value is a positive errno */
3455         error_setg_errno(errp, ret, "fadvise failed");
3456         return;
3457     }
3458 
3459     if (s->check_cache_dropped) {
3460         check_cache_dropped(bs, errp);
3461     }
3462 #else /* __linux__ */
3463     /* Do nothing.  Live migration to a remote host with cache.direct=off is
3464      * unsupported on other host operating systems.  Cache consistency issues
3465      * may occur but no error is reported here, partly because that's the
3466      * historical behavior and partly because it's hard to differentiate valid
3467      * configurations that should not cause errors.
3468      */
3469 #endif /* !__linux__ */
3470 }
3471 
3472 static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
3473 {
3474     if (ret) {
3475         s->stats.discard_nb_failed++;
3476     } else {
3477         s->stats.discard_nb_ok++;
3478         s->stats.discard_bytes_ok += nbytes;
3479     }
3480 }
3481 
/*
 * zone report - Get a zoned block device's information in the form
 * of an array of zone descriptors.
 * zones is an array of zone descriptors to hold zone information on reply;
 * offset can be any byte within the entire size of the device;
 * nr_zones is the maximum number of zones the command should operate on.
 */
3489 #if defined(CONFIG_BLKZONED)
3490 static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
3491                                            unsigned int *nr_zones,
3492                                            BlockZoneDescriptor *zones) {
3493     BDRVRawState *s = bs->opaque;
3494     RawPosixAIOData acb = (RawPosixAIOData) {
3495         .bs         = bs,
3496         .aio_fildes = s->fd,
3497         .aio_type   = QEMU_AIO_ZONE_REPORT,
3498         .aio_offset = offset,
3499         .zone_report    = {
3500             .nr_zones       = nr_zones,
3501             .zones          = zones,
3502         },
3503     };
3504 
3505     trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
3506     return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
3507 }
3508 #endif
3509 
3510 /*
3511  * zone management operations - Execute an operation on a zone
3512  */
3513 #if defined(CONFIG_BLKZONED)
3514 static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
3515         int64_t offset, int64_t len) {
3516     BDRVRawState *s = bs->opaque;
3517     RawPosixAIOData acb;
3518     int64_t zone_size, zone_size_mask;
3519     const char *op_name;
3520     unsigned long zo;
3521     int ret;
3522     BlockZoneWps *wps = bs->wps;
3523     int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
3524 
3525     zone_size = bs->bl.zone_size;
3526     zone_size_mask = zone_size - 1;
3527     if (offset & zone_size_mask) {
3528         error_report("sector offset %" PRId64 " is not aligned to zone size "
3529                      "%" PRId64 "", offset / 512, zone_size / 512);
3530         return -EINVAL;
3531     }
3532 
3533     if (((offset + len) < capacity && len & zone_size_mask) ||
3534         offset + len > capacity) {
3535         error_report("number of sectors %" PRId64 " is not aligned to zone size"
3536                       " %" PRId64 "", len / 512, zone_size / 512);
3537         return -EINVAL;
3538     }
3539 
3540     uint32_t i = offset / bs->bl.zone_size;
3541     uint32_t nrz = len / bs->bl.zone_size;
3542     uint64_t *wp = &wps->wp[i];
3543     if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
3544         error_report("zone mgmt operations are not allowed for conventional zones");
3545         return -EIO;
3546     }
3547 
3548     switch (op) {
3549     case BLK_ZO_OPEN:
3550         op_name = "BLKOPENZONE";
3551         zo = BLKOPENZONE;
3552         break;
3553     case BLK_ZO_CLOSE:
3554         op_name = "BLKCLOSEZONE";
3555         zo = BLKCLOSEZONE;
3556         break;
3557     case BLK_ZO_FINISH:
3558         op_name = "BLKFINISHZONE";
3559         zo = BLKFINISHZONE;
3560         break;
3561     case BLK_ZO_RESET:
3562         op_name = "BLKRESETZONE";
3563         zo = BLKRESETZONE;
3564         break;
3565     default:
3566         error_report("Unsupported zone op: 0x%x", op);
3567         return -ENOTSUP;
3568     }
3569 
3570     acb = (RawPosixAIOData) {
3571         .bs             = bs,
3572         .aio_fildes     = s->fd,
3573         .aio_type       = QEMU_AIO_ZONE_MGMT,
3574         .aio_offset     = offset,
3575         .aio_nbytes     = len,
3576         .zone_mgmt  = {
3577             .op = zo,
3578         },
3579     };
3580 
3581     trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
3582                         len >> BDRV_SECTOR_BITS);
3583     ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
3584     if (ret != 0) {
3585         update_zones_wp(bs, s->fd, offset, nrz);
3586         error_report("ioctl %s failed %d", op_name, ret);
3587         return ret;
3588     }
3589 
3590     if (zo == BLKRESETZONE && len == capacity) {
3591         ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
3592         if (ret < 0) {
3593             error_report("reporting single wp failed");
3594             return ret;
3595         }
3596     } else if (zo == BLKRESETZONE) {
3597         for (unsigned int j = 0; j < nrz; ++j) {
3598             wp[j] = offset + j * zone_size;
3599         }
3600     } else if (zo == BLKFINISHZONE) {
3601         for (unsigned int j = 0; j < nrz; ++j) {
3602             /* The zoned device allows the last zone smaller that the
3603              * zone size. */
3604             wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
3605         }
3606     }
3607 
3608     return ret;
3609 }
3610 #endif
3611 
3612 #if defined(CONFIG_BLKZONED)
3613 static int coroutine_fn GRAPH_RDLOCK
3614 raw_co_zone_append(BlockDriverState *bs,
3615                    int64_t *offset,
3616                    QEMUIOVector *qiov,
3617                    BdrvRequestFlags flags) {
3618     assert(flags == 0);
3619     int64_t zone_size_mask = bs->bl.zone_size - 1;
3620     int64_t iov_len = 0;
3621     int64_t len = 0;
3622 
3623     if (*offset & zone_size_mask) {
3624         error_report("sector offset %" PRId64 " is not aligned to zone size "
3625                      "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
3626         return -EINVAL;
3627     }
3628 
3629     int64_t wg = bs->bl.write_granularity;
3630     int64_t wg_mask = wg - 1;
3631     for (int i = 0; i < qiov->niov; i++) {
3632         iov_len = qiov->iov[i].iov_len;
3633         if (iov_len & wg_mask) {
3634             error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
3635                          "block size %" PRId64 "", i, iov_len, wg);
3636             return -EINVAL;
3637         }
3638         len += iov_len;
3639     }
3640 
3641     trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
3642     return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND, 0);
3643 }
3644 #endif
3645 
3646 static coroutine_fn int
3647 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
3648                 bool blkdev)
3649 {
3650     BDRVRawState *s = bs->opaque;
3651     RawPosixAIOData acb;
3652     int ret;
3653 
3654     acb = (RawPosixAIOData) {
3655         .bs             = bs,
3656         .aio_fildes     = s->fd,
3657         .aio_type       = QEMU_AIO_DISCARD,
3658         .aio_offset     = offset,
3659         .aio_nbytes     = bytes,
3660     };
3661 
3662     if (blkdev) {
3663         acb.aio_type |= QEMU_AIO_BLKDEV;
3664     }
3665 
3666     ret = raw_thread_pool_submit(handle_aiocb_discard, &acb);
3667     raw_account_discard(s, bytes, ret);
3668     return ret;
3669 }
3670 
3671 static coroutine_fn int
3672 raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
3673 {
3674     return raw_do_pdiscard(bs, offset, bytes, false);
3675 }
3676 
3677 static int coroutine_fn
3678 raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
3679                      BdrvRequestFlags flags, bool blkdev)
3680 {
3681     BDRVRawState *s = bs->opaque;
3682     RawPosixAIOData acb;
3683     ThreadPoolFunc *handler;
3684 
3685 #ifdef CONFIG_FALLOCATE
3686     if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
3687         BdrvTrackedRequest *req;
3688 
3689         /*
3690          * This is a workaround for a bug in the Linux XFS driver,
3691          * where writes submitted through the AIO interface will be
3692          * discarded if they happen beyond a concurrently running
3693          * fallocate() that increases the file length (i.e., both the
3694          * write and the fallocate() happen beyond the EOF).
3695          *
3696          * To work around it, we extend the tracked request for this
3697          * zero write until INT64_MAX (effectively infinity), and mark
3698          * it as serializing.
3699          *
3700          * We have to enable this workaround for all filesystems and
3701          * AIO modes (not just XFS with aio=native), because for
3702          * remote filesystems we do not know the host configuration.
3703          */
3704 
3705         req = bdrv_co_get_self_request(bs);
3706         assert(req);
3707         assert(req->type == BDRV_TRACKED_WRITE);
3708         assert(req->offset <= offset);
3709         assert(req->offset + req->bytes >= offset + bytes);
3710 
3711         req->bytes = BDRV_MAX_LENGTH - req->offset;
3712 
3713         bdrv_check_request(req->offset, req->bytes, &error_abort);
3714 
3715         bdrv_make_request_serialising(req, bs->bl.request_alignment);
3716     }
3717 #endif
3718 
3719     acb = (RawPosixAIOData) {
3720         .bs             = bs,
3721         .aio_fildes     = s->fd,
3722         .aio_type       = QEMU_AIO_WRITE_ZEROES,
3723         .aio_offset     = offset,
3724         .aio_nbytes     = bytes,
3725     };
3726 
3727     if (blkdev) {
3728         acb.aio_type |= QEMU_AIO_BLKDEV;
3729     }
3730     if (flags & BDRV_REQ_NO_FALLBACK) {
3731         acb.aio_type |= QEMU_AIO_NO_FALLBACK;
3732     }
3733 
3734     if (flags & BDRV_REQ_MAY_UNMAP) {
3735         acb.aio_type |= QEMU_AIO_DISCARD;
3736         handler = handle_aiocb_write_zeroes_unmap;
3737     } else {
3738         handler = handle_aiocb_write_zeroes;
3739     }
3740 
3741     return raw_thread_pool_submit(handler, &acb);
3742 }
3743 
3744 static int coroutine_fn raw_co_pwrite_zeroes(
3745     BlockDriverState *bs, int64_t offset,
3746     int64_t bytes, BdrvRequestFlags flags)
3747 {
3748     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
3749 }
3750 
3751 static int coroutine_fn
3752 raw_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3753 {
3754     return 0;
3755 }
3756 
3757 static ImageInfoSpecific *raw_get_specific_info(BlockDriverState *bs,
3758                                                 Error **errp)
3759 {
3760     ImageInfoSpecificFile *file_info = g_new0(ImageInfoSpecificFile, 1);
3761     ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);
3762 
3763     *spec_info = (ImageInfoSpecific){
3764         .type = IMAGE_INFO_SPECIFIC_KIND_FILE,
3765         .u.file.data = file_info,
3766     };
3767 
3768 #ifdef FS_IOC_FSGETXATTR
3769     {
3770         BDRVRawState *s = bs->opaque;
3771         struct fsxattr attr;
3772         int ret;
3773 
3774         ret = ioctl(s->fd, FS_IOC_FSGETXATTR, &attr);
3775         if (!ret && attr.fsx_extsize != 0) {
3776             file_info->has_extent_size_hint = true;
3777             file_info->extent_size_hint = attr.fsx_extsize;
3778         }
3779     }
3780 #endif
3781 
3782     return spec_info;
3783 }
3784 
3785 static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
3786 {
3787     BDRVRawState *s = bs->opaque;
3788     return (BlockStatsSpecificFile) {
3789         .discard_nb_ok = s->stats.discard_nb_ok,
3790         .discard_nb_failed = s->stats.discard_nb_failed,
3791         .discard_bytes_ok = s->stats.discard_bytes_ok,
3792     };
3793 }
3794 
3795 static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
3796 {
3797     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3798 
3799     stats->driver = BLOCKDEV_DRIVER_FILE;
3800     stats->u.file = get_blockstats_specific_file(bs);
3801 
3802     return stats;
3803 }
3804 
3805 #if defined(HAVE_HOST_BLOCK_DEVICE)
3806 static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
3807 {
3808     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3809 
3810     stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
3811     stats->u.host_device = get_blockstats_specific_file(bs);
3812 
3813     return stats;
3814 }
3815 #endif /* HAVE_HOST_BLOCK_DEVICE */
3816 
3817 static QemuOptsList raw_create_opts = {
3818     .name = "raw-create-opts",
3819     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
3820     .desc = {
3821         {
3822             .name = BLOCK_OPT_SIZE,
3823             .type = QEMU_OPT_SIZE,
3824             .help = "Virtual disk size"
3825         },
3826         {
3827             .name = BLOCK_OPT_NOCOW,
3828             .type = QEMU_OPT_BOOL,
3829             .help = "Turn off copy-on-write (valid only on btrfs)"
3830         },
3831         {
3832             .name = BLOCK_OPT_PREALLOC,
3833             .type = QEMU_OPT_STRING,
3834             .help = "Preallocation mode (allowed values: off"
3835 #ifdef CONFIG_POSIX_FALLOCATE
3836                     ", falloc"
3837 #endif
3838                     ", full)"
3839         },
3840         {
3841             .name = BLOCK_OPT_EXTENT_SIZE_HINT,
3842             .type = QEMU_OPT_SIZE,
3843             .help = "Extent size hint for the image file, 0 to disable"
3844         },
3845         { /* end of list */ }
3846     }
3847 };
3848 
3849 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
3850                           Error **errp)
3851 {
3852     BDRVRawState *s = bs->opaque;
3853     int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags;
3854     int open_flags;
3855     int ret;
3856 
3857     /* We may need a new fd if auto-read-only switches the mode */
3858     ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm, errp);
3859     if (ret < 0) {
3860         return ret;
3861     } else if (ret != s->fd) {
3862         Error *local_err = NULL;
3863 
3864         /*
3865          * Fail already check_perm() if we can't get a working O_DIRECT
3866          * alignment with the new fd.
3867          */
3868         raw_probe_alignment(bs, ret, &local_err);
3869         if (local_err) {
3870             error_propagate(errp, local_err);
3871             return -EINVAL;
3872         }
3873 
3874         s->perm_change_fd = ret;
3875         s->perm_change_flags = open_flags;
3876     }
3877 
3878     /* Prepare permissions on old fd to avoid conflicts between old and new,
3879      * but keep everything locked that new will need. */
3880     ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
3881     if (ret < 0) {
3882         goto fail;
3883     }
3884 
3885     /* Copy locks to the new fd */
3886     if (s->perm_change_fd && s->use_lock) {
3887         ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
3888                                    false, errp);
3889         if (ret < 0) {
3890             raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3891             goto fail;
3892         }
3893     }
3894     return 0;
3895 
3896 fail:
3897     if (s->perm_change_fd) {
3898         qemu_close(s->perm_change_fd);
3899     }
3900     s->perm_change_fd = 0;
3901     return ret;
3902 }
3903 
3904 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
3905 {
3906     BDRVRawState *s = bs->opaque;
3907 
3908     /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
3909      * called after .bdrv_reopen_commit) */
3910     if (s->perm_change_fd && s->fd != s->perm_change_fd) {
3911         qemu_close(s->fd);
3912         s->fd = s->perm_change_fd;
3913         s->open_flags = s->perm_change_flags;
3914     }
3915     s->perm_change_fd = 0;
3916 
3917     raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
3918     s->perm = perm;
3919     s->shared_perm = shared;
3920 }
3921 
3922 static void raw_abort_perm_update(BlockDriverState *bs)
3923 {
3924     BDRVRawState *s = bs->opaque;
3925 
3926     /* For reopen, .bdrv_reopen_abort is called afterwards and will close
3927      * the file descriptor. */
3928     if (s->perm_change_fd) {
3929         qemu_close(s->perm_change_fd);
3930     }
3931     s->perm_change_fd = 0;
3932 
3933     raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3934 }
3935 
3936 static int coroutine_fn GRAPH_RDLOCK raw_co_copy_range_from(
3937         BlockDriverState *bs, BdrvChild *src, int64_t src_offset,
3938         BdrvChild *dst, int64_t dst_offset, int64_t bytes,
3939         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
3940 {
3941     return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3942                                  read_flags, write_flags);
3943 }
3944 
3945 static int coroutine_fn GRAPH_RDLOCK
3946 raw_co_copy_range_to(BlockDriverState *bs,
3947                      BdrvChild *src, int64_t src_offset,
3948                      BdrvChild *dst, int64_t dst_offset,
3949                      int64_t bytes, BdrvRequestFlags read_flags,
3950                      BdrvRequestFlags write_flags)
3951 {
3952     RawPosixAIOData acb;
3953     BDRVRawState *s = bs->opaque;
3954     BDRVRawState *src_s;
3955 
3956     assert(dst->bs == bs);
3957     if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
3958         return -ENOTSUP;
3959     }
3960 
3961     src_s = src->bs->opaque;
3962     if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
3963         return -EIO;
3964     }
3965 
3966     acb = (RawPosixAIOData) {
3967         .bs             = bs,
3968         .aio_type       = QEMU_AIO_COPY_RANGE,
3969         .aio_fildes     = src_s->fd,
3970         .aio_offset     = src_offset,
3971         .aio_nbytes     = bytes,
3972         .copy_range     = {
3973             .aio_fd2        = s->fd,
3974             .aio_offset2    = dst_offset,
3975         },
3976     };
3977 
3978     return raw_thread_pool_submit(handle_aiocb_copy_range, &acb);
3979 }
3980 
3981 BlockDriver bdrv_file = {
3982     .format_name = "file",
3983     .protocol_name = "file",
3984     .instance_size = sizeof(BDRVRawState),
3985     .bdrv_needs_filename = true,
3986     .bdrv_probe = NULL, /* no probe for protocols */
3987     .bdrv_parse_filename = raw_parse_filename,
3988     .bdrv_open      = raw_open,
3989     .bdrv_reopen_prepare = raw_reopen_prepare,
3990     .bdrv_reopen_commit = raw_reopen_commit,
3991     .bdrv_reopen_abort = raw_reopen_abort,
3992     .bdrv_close = raw_close,
3993     .bdrv_co_create = raw_co_create,
3994     .bdrv_co_create_opts = raw_co_create_opts,
3995     .bdrv_has_zero_init = bdrv_has_zero_init_1,
3996     .bdrv_co_block_status = raw_co_block_status,
3997     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3998     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
3999     .bdrv_co_delete_file = raw_co_delete_file,
4000 
4001     .bdrv_co_preadv         = raw_co_preadv,
4002     .bdrv_co_pwritev        = raw_co_pwritev,
4003     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
4004     .bdrv_co_pdiscard       = raw_co_pdiscard,
4005     .bdrv_co_copy_range_from = raw_co_copy_range_from,
4006     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
4007     .bdrv_refresh_limits = raw_refresh_limits,
4008 
4009     .bdrv_co_truncate                   = raw_co_truncate,
4010     .bdrv_co_getlength                  = raw_co_getlength,
4011     .bdrv_co_get_info                   = raw_co_get_info,
4012     .bdrv_get_specific_info             = raw_get_specific_info,
4013     .bdrv_co_get_allocated_file_size    = raw_co_get_allocated_file_size,
4014     .bdrv_get_specific_stats = raw_get_specific_stats,
4015     .bdrv_check_perm = raw_check_perm,
4016     .bdrv_set_perm   = raw_set_perm,
4017     .bdrv_abort_perm_update = raw_abort_perm_update,
4018     .create_opts = &raw_create_opts,
4019     .mutable_opts = mutable_opts,
4020 };
4021 
4022 /***********************************************/
4023 /* host device */
4024 
4025 #if defined(HAVE_HOST_BLOCK_DEVICE)
4026 
4027 #if defined(__APPLE__) && defined(__MACH__)
4028 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
4029                                 CFIndex maxPathSize, int flags);
4030 
4031 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
4032 {
4033     kern_return_t kernResult = KERN_FAILURE;
4034     mach_port_t mainPort;
4035     CFMutableDictionaryRef  classesToMatch;
4036     const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
4037     char *mediaType = NULL;
4038 
4039     kernResult = IOMainPort(MACH_PORT_NULL, &mainPort);
4040     if ( KERN_SUCCESS != kernResult ) {
4041         printf("IOMainPort returned %d\n", kernResult);
4042     }
4043 
4044     int index;
4045     for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
4046         classesToMatch = IOServiceMatching(matching_array[index]);
4047         if (classesToMatch == NULL) {
4048             error_report("IOServiceMatching returned NULL for %s",
4049                          matching_array[index]);
4050             continue;
4051         }
4052         CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
4053                              kCFBooleanTrue);
4054         kernResult = IOServiceGetMatchingServices(mainPort, classesToMatch,
4055                                                   mediaIterator);
4056         if (kernResult != KERN_SUCCESS) {
4057             error_report("Note: IOServiceGetMatchingServices returned %d",
4058                          kernResult);
4059             continue;
4060         }
4061 
4062         /* If a match was found, leave the loop */
4063         if (*mediaIterator != 0) {
4064             trace_file_FindEjectableOpticalMedia(matching_array[index]);
4065             mediaType = g_strdup(matching_array[index]);
4066             break;
4067         }
4068     }
4069     return mediaType;
4070 }
4071 
4072 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
4073                          CFIndex maxPathSize, int flags)
4074 {
4075     io_object_t     nextMedia;
4076     kern_return_t   kernResult = KERN_FAILURE;
4077     *bsdPath = '\0';
4078     nextMedia = IOIteratorNext( mediaIterator );
4079     if ( nextMedia )
4080     {
4081         CFTypeRef   bsdPathAsCFString;
4082     bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
4083         if ( bsdPathAsCFString ) {
4084             size_t devPathLength;
4085             strcpy( bsdPath, _PATH_DEV );
4086             if (flags & BDRV_O_NOCACHE) {
4087                 strcat(bsdPath, "r");
4088             }
4089             devPathLength = strlen( bsdPath );
4090             if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
4091                 kernResult = KERN_SUCCESS;
4092             }
4093             CFRelease( bsdPathAsCFString );
4094         }
4095         IOObjectRelease( nextMedia );
4096     }
4097 
4098     return kernResult;
4099 }
4100 
4101 /* Sets up a real cdrom for use in QEMU */
4102 static bool setup_cdrom(char *bsd_path, Error **errp)
4103 {
4104     int index, num_of_test_partitions = 2, fd;
4105     char test_partition[MAXPATHLEN];
4106     bool partition_found = false;
4107 
4108     /* look for a working partition */
4109     for (index = 0; index < num_of_test_partitions; index++) {
4110         snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
4111                  index);
4112         fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
4113         if (fd >= 0) {
4114             partition_found = true;
4115             qemu_close(fd);
4116             break;
4117         }
4118     }
4119 
4120     /* if a working partition on the device was not found */
4121     if (partition_found == false) {
4122         error_setg(errp, "Failed to find a working partition on disc");
4123     } else {
4124         trace_file_setup_cdrom(test_partition);
4125         pstrcpy(bsd_path, MAXPATHLEN, test_partition);
4126     }
4127     return partition_found;
4128 }
4129 
4130 /* Prints directions on mounting and unmounting a device */
/*
 * Report, via error_report(), a hint that @file_name may need to be
 * unmounted first, followed by the diskutil commands for unmounting and
 * mounting it.
 */
static void print_unmounting_directions(const char *file_name)
{
    const char *dev = file_name;

    error_report("If device %s is mounted on the desktop, unmount"
                 " it first before using it in QEMU",
                 dev);
    error_report("Command to unmount device: diskutil unmountDisk %s", dev);
    error_report("Command to mount device: diskutil mountDisk %s", dev);
}
4139 
4140 #endif /* defined(__APPLE__) && defined(__MACH__) */
4141 
/*
 * Probe score of the host_device driver for @filename:
 *   50  if the name starts with "/dev/cdrom",
 *   100 if it can be stat()ed and is a character or block device,
 *   0   otherwise.
 */
static int hdev_probe_device(const char *filename)
{
    struct stat st;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    if (stat(filename, &st) < 0) {
        return 0;
    }

    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
        return 100;
    }
    return 0;
}
4157 
4158 static void hdev_parse_filename(const char *filename, QDict *options,
4159                                 Error **errp)
4160 {
4161     bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
4162 }
4163 
4164 static bool hdev_is_sg(BlockDriverState *bs)
4165 {
4166 
4167 #if defined(__linux__)
4168 
4169     BDRVRawState *s = bs->opaque;
4170     struct stat st;
4171     struct sg_scsi_id scsiid;
4172     int sg_version;
4173     int ret;
4174 
4175     if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
4176         return false;
4177     }
4178 
4179     ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
4180     if (ret < 0) {
4181         return false;
4182     }
4183 
4184     ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
4185     if (ret >= 0) {
4186         trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
4187         return true;
4188     }
4189 
4190 #endif
4191 
4192     return false;
4193 }
4194 
4195 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
4196                      Error **errp)
4197 {
4198     BDRVRawState *s = bs->opaque;
4199     int ret;
4200 
4201 #if defined(__APPLE__) && defined(__MACH__)
4202     /*
4203      * Caution: while qdict_get_str() is fine, getting non-string types
4204      * would require more care.  When @options come from -blockdev or
4205      * blockdev_add, its members are typed according to the QAPI
4206      * schema, but when they come from -drive, they're all QString.
4207      */
4208     const char *filename = qdict_get_str(options, "filename");
4209     char bsd_path[MAXPATHLEN] = "";
4210     bool error_occurred = false;
4211 
4212     /* If using a real cdrom */
4213     if (strcmp(filename, "/dev/cdrom") == 0) {
4214         char *mediaType = NULL;
4215         kern_return_t ret_val;
4216         io_iterator_t mediaIterator = 0;
4217 
4218         mediaType = FindEjectableOpticalMedia(&mediaIterator);
4219         if (mediaType == NULL) {
4220             error_setg(errp, "Please make sure your CD/DVD is in the optical"
4221                        " drive");
4222             error_occurred = true;
4223             goto hdev_open_Mac_error;
4224         }
4225 
4226         ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
4227         if (ret_val != KERN_SUCCESS) {
4228             error_setg(errp, "Could not get BSD path for optical drive");
4229             error_occurred = true;
4230             goto hdev_open_Mac_error;
4231         }
4232 
4233         /* If a real optical drive was not found */
4234         if (bsd_path[0] == '\0') {
4235             error_setg(errp, "Failed to obtain bsd path for optical drive");
4236             error_occurred = true;
4237             goto hdev_open_Mac_error;
4238         }
4239 
4240         /* If using a cdrom disc and finding a partition on the disc failed */
4241         if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
4242             setup_cdrom(bsd_path, errp) == false) {
4243             print_unmounting_directions(bsd_path);
4244             error_occurred = true;
4245             goto hdev_open_Mac_error;
4246         }
4247 
4248         qdict_put_str(options, "filename", bsd_path);
4249 
4250 hdev_open_Mac_error:
4251         g_free(mediaType);
4252         if (mediaIterator) {
4253             IOObjectRelease(mediaIterator);
4254         }
4255         if (error_occurred) {
4256             return -ENOENT;
4257         }
4258     }
4259 #endif /* defined(__APPLE__) && defined(__MACH__) */
4260 
4261     s->type = FTYPE_FILE;
4262 
4263     ret = raw_open_common(bs, options, flags, 0, true, errp);
4264     if (ret < 0) {
4265 #if defined(__APPLE__) && defined(__MACH__)
4266         if (*bsd_path) {
4267             filename = bsd_path;
4268         }
4269         /* if a physical device experienced an error while being opened */
4270         if (strncmp(filename, "/dev/", 5) == 0) {
4271             print_unmounting_directions(filename);
4272         }
4273 #endif /* defined(__APPLE__) && defined(__MACH__) */
4274         return ret;
4275     }
4276 
4277     /* Since this does ioctl the device must be already opened */
4278     bs->sg = hdev_is_sg(bs);
4279 
4280     /* sg devices aren't even block devices and can't use dm-mpath */
4281     s->use_mpath = !bs->sg;
4282 
4283     return ret;
4284 }
4285 
4286 #if defined(__linux__)
4287 #if defined(DM_MPATH_PROBE_PATHS)
4288 static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr)
4289 {
4290     if (ret < 0) {
4291         /* Path errors sometimes result in -ENODEV */
4292         return ret == -ENODEV;
4293     }
4294 
4295     if (io_hdr->host_status != SCSI_HOST_OK) {
4296         return true;
4297     }
4298 
4299     switch (io_hdr->status) {
4300     case GOOD:
4301     case CONDITION_GOOD:
4302     case INTERMEDIATE_GOOD:
4303     case INTERMEDIATE_C_GOOD:
4304     case RESERVATION_CONFLICT:
4305     case COMMAND_TERMINATED:
4306         return false;
4307     case CHECK_CONDITION:
4308         return !scsi_sense_buf_is_guest_recoverable(io_hdr->sbp,
4309                                                     io_hdr->mx_sb_len);
4310     default:
4311         return true;
4312     }
4313 }
4314 
4315 static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
4316 {
4317     BDRVRawState *s = acb->bs->opaque;
4318     RawPosixAIOData probe_acb;
4319 
4320     if (!s->use_mpath) {
4321         return false;
4322     }
4323 
4324     if (!sgio_path_error(ret, acb->ioctl.buf)) {
4325         return false;
4326     }
4327 
4328     probe_acb = (RawPosixAIOData) {
4329         .bs         = acb->bs,
4330         .aio_type   = QEMU_AIO_IOCTL,
4331         .aio_fildes = s->fd,
4332         .aio_offset = 0,
4333         .ioctl      = {
4334             .buf        = NULL,
4335             .cmd        = DM_MPATH_PROBE_PATHS,
4336         },
4337     };
4338 
4339     ret = raw_thread_pool_submit(handle_aiocb_ioctl, &probe_acb);
4340     if (ret == -ENOTTY) {
4341         s->use_mpath = false;
4342     } else if (ret == -EAGAIN) {
4343         /* The device might be suspended for a table reload, worth retrying */
4344         return true;
4345     }
4346 
4347     return ret == 0;
4348 }
4349 #else
4350 static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
4351 {
4352     return false;
4353 }
4354 #endif /* DM_MPATH_PROBE_PATHS */
4355 
4356 static int coroutine_fn
4357 hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4358 {
4359     BDRVRawState *s = bs->opaque;
4360     RawPosixAIOData acb;
4361     uint64_t eagain_sleep_ns = 1 * SCALE_MS;
4362     int retries = SG_IO_MAX_RETRIES;
4363     int ret;
4364 
4365     ret = fd_open(bs);
4366     if (ret < 0) {
4367         return ret;
4368     }
4369 
4370     if (req == SG_IO && s->pr_mgr) {
4371         struct sg_io_hdr *io_hdr = buf;
4372         if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
4373             io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
4374             return pr_manager_execute(s->pr_mgr, qemu_get_current_aio_context(),
4375                                       s->fd, io_hdr);
4376         }
4377     }
4378 
4379     acb = (RawPosixAIOData) {
4380         .bs         = bs,
4381         .aio_type   = QEMU_AIO_IOCTL,
4382         .aio_fildes = s->fd,
4383         .aio_offset = 0,
4384         .ioctl      = {
4385             .buf        = buf,
4386             .cmd        = req,
4387         },
4388     };
4389 
4390 retry:
4391     ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
4392     if (req == SG_IO && s->use_mpath) {
4393         if (ret == -EAGAIN && eagain_sleep_ns < NANOSECONDS_PER_SECOND) {
4394             /*
4395              * If this is a multipath device, it is probably suspended.
4396              *
4397              * This can happen while the dm table is reloaded, e.g. because a
4398              * path is added or removed. This is an operation that should
4399              * complete within 1ms, so just wait a bit and retry.
4400              *
4401              * There are also some cases in which libmpathpersist must recover
4402              * from path failure during its operation, which can leave the
4403              * device suspended for a bit longer while the library brings back
4404              * reservations into the expected state.
4405              *
4406              * Use increasing delays to cover both cases without waiting
4407              * excessively, and stop after a bit more than a second (1023 ms).
4408              * This is a tolerable delay before we return an error and
4409              * potentially stop the VM.
4410              */
4411             qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, eagain_sleep_ns);
4412             eagain_sleep_ns *= 2;
4413             goto retry;
4414         }
4415 
4416         /* Even for ret == 0, the SG_IO header can contain an error */
4417         if (retries-- && hdev_co_ioctl_sgio_retry(&acb, ret)) {
4418             goto retry;
4419         }
4420     }
4421 
4422     return ret;
4423 }
4424 #endif /* linux */
4425 
4426 static coroutine_fn int
4427 hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
4428 {
4429     BDRVRawState *s = bs->opaque;
4430     int ret;
4431 
4432     ret = fd_open(bs);
4433     if (ret < 0) {
4434         raw_account_discard(s, bytes, ret);
4435         return ret;
4436     }
4437     return raw_do_pdiscard(bs, offset, bytes, true);
4438 }
4439 
4440 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
4441     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
4442 {
4443     int rc;
4444 
4445     rc = fd_open(bs);
4446     if (rc < 0) {
4447         return rc;
4448     }
4449 
4450     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
4451 }
4452 
4453 static BlockDriver bdrv_host_device = {
4454     .format_name        = "host_device",
4455     .protocol_name        = "host_device",
4456     .instance_size      = sizeof(BDRVRawState),
4457     .bdrv_needs_filename = true,
4458     .bdrv_probe_device  = hdev_probe_device,
4459     .bdrv_parse_filename = hdev_parse_filename,
4460     .bdrv_open          = hdev_open,
4461     .bdrv_close         = raw_close,
4462     .bdrv_reopen_prepare = raw_reopen_prepare,
4463     .bdrv_reopen_commit  = raw_reopen_commit,
4464     .bdrv_reopen_abort   = raw_reopen_abort,
4465     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
4466     .create_opts         = &bdrv_create_opts_simple,
4467     .mutable_opts        = mutable_opts,
4468     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
4469     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
4470 
4471     .bdrv_co_preadv         = raw_co_preadv,
4472     .bdrv_co_pwritev        = raw_co_pwritev,
4473     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
4474     .bdrv_co_pdiscard       = hdev_co_pdiscard,
4475     .bdrv_co_copy_range_from = raw_co_copy_range_from,
4476     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
4477     .bdrv_refresh_limits = raw_refresh_limits,
4478 
4479     .bdrv_co_truncate                   = raw_co_truncate,
4480     .bdrv_co_getlength                  = raw_co_getlength,
4481     .bdrv_co_get_info                   = raw_co_get_info,
4482     .bdrv_get_specific_info             = raw_get_specific_info,
4483     .bdrv_co_get_allocated_file_size    = raw_co_get_allocated_file_size,
4484     .bdrv_get_specific_stats = hdev_get_specific_stats,
4485     .bdrv_check_perm = raw_check_perm,
4486     .bdrv_set_perm   = raw_set_perm,
4487     .bdrv_abort_perm_update = raw_abort_perm_update,
4488     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
4489     .bdrv_probe_geometry = hdev_probe_geometry,
4490 
4491     /* generic scsi device */
4492 #ifdef __linux__
4493     .bdrv_co_ioctl          = hdev_co_ioctl,
4494 #endif
4495 
4496     /* zoned device */
4497 #if defined(CONFIG_BLKZONED)
4498     /* zone management operations */
4499     .bdrv_co_zone_report = raw_co_zone_report,
4500     .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
4501     .bdrv_co_zone_append = raw_co_zone_append,
4502 #endif
4503 };
4504 
4505 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
4506 static void cdrom_parse_filename(const char *filename, QDict *options,
4507                                  Error **errp)
4508 {
4509     bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
4510 }
4511 
4512 static void cdrom_refresh_limits(BlockDriverState *bs, Error **errp)
4513 {
4514     bs->bl.has_variable_length = true;
4515     raw_refresh_limits(bs, errp);
4516 }
4517 #endif
4518 
4519 #ifdef __linux__
4520 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
4521                       Error **errp)
4522 {
4523     BDRVRawState *s = bs->opaque;
4524 
4525     s->type = FTYPE_CD;
4526 
4527     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
4528     return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
4529 }
4530 
4531 static int cdrom_probe_device(const char *filename)
4532 {
4533     int fd, ret;
4534     int prio = 0;
4535     struct stat st;
4536 
4537     fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
4538     if (fd < 0) {
4539         goto out;
4540     }
4541     ret = fstat(fd, &st);
4542     if (ret == -1 || !S_ISBLK(st.st_mode)) {
4543         goto outc;
4544     }
4545 
4546     /* Attempt to detect via a CDROM specific ioctl */
4547     ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
4548     if (ret >= 0)
4549         prio = 100;
4550 
4551 outc:
4552     qemu_close(fd);
4553 out:
4554     return prio;
4555 }
4556 
4557 static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs)
4558 {
4559     BDRVRawState *s = bs->opaque;
4560     int ret;
4561 
4562     ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
4563     return ret == CDS_DISC_OK;
4564 }
4565 
4566 static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag)
4567 {
4568     BDRVRawState *s = bs->opaque;
4569 
4570     if (eject_flag) {
4571         if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
4572             perror("CDROMEJECT");
4573     } else {
4574         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
4575             perror("CDROMEJECT");
4576     }
4577 }
4578 
4579 static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
4580 {
4581     BDRVRawState *s = bs->opaque;
4582 
4583     if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
4584         /*
4585          * Note: an error can happen if the distribution automatically
4586          * mounts the CD-ROM
4587          */
4588         /* perror("CDROM_LOCKDOOR"); */
4589     }
4590 }
4591 
4592 static BlockDriver bdrv_host_cdrom = {
4593     .format_name            = "host_cdrom",
4594     .protocol_name          = "host_cdrom",
4595     .instance_size          = sizeof(BDRVRawState),
4596     .bdrv_needs_filename    = true,
4597     .bdrv_probe_device      = cdrom_probe_device,
4598     .bdrv_parse_filename    = cdrom_parse_filename,
4599     .bdrv_open              = cdrom_open,
4600     .bdrv_close             = raw_close,
4601     .bdrv_reopen_prepare    = raw_reopen_prepare,
4602     .bdrv_reopen_commit     = raw_reopen_commit,
4603     .bdrv_reopen_abort      = raw_reopen_abort,
4604     .bdrv_co_create_opts    = bdrv_co_create_opts_simple,
4605     .create_opts            = &bdrv_create_opts_simple,
4606     .mutable_opts           = mutable_opts,
4607     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
4608 
4609     .bdrv_co_preadv         = raw_co_preadv,
4610     .bdrv_co_pwritev        = raw_co_pwritev,
4611     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
4612     .bdrv_refresh_limits    = cdrom_refresh_limits,
4613 
4614     .bdrv_co_truncate                   = raw_co_truncate,
4615     .bdrv_co_getlength                  = raw_co_getlength,
4616     .bdrv_co_get_allocated_file_size    = raw_co_get_allocated_file_size,
4617 
4618     /* removable device support */
4619     .bdrv_co_is_inserted    = cdrom_co_is_inserted,
4620     .bdrv_co_eject          = cdrom_co_eject,
4621     .bdrv_co_lock_medium    = cdrom_co_lock_medium,
4622 
4623     /* generic scsi device */
4624     .bdrv_co_ioctl      = hdev_co_ioctl,
4625 };
4626 #endif /* __linux__ */
4627 
4628 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
4629 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
4630                       Error **errp)
4631 {
4632     BDRVRawState *s = bs->opaque;
4633     int ret;
4634 
4635     s->type = FTYPE_CD;
4636 
4637     ret = raw_open_common(bs, options, flags, 0, true, errp);
4638     if (ret) {
4639         return ret;
4640     }
4641 
4642     /* make sure the door isn't locked at this time */
4643     ioctl(s->fd, CDIOCALLOW);
4644     return 0;
4645 }
4646 
/*
 * Probe score of the FreeBSD host_cdrom driver: 100 for names starting
 * with "/dev/cd" or "/dev/acd", 0 for everything else.
 */
static int cdrom_probe_device(const char *filename)
{
    if (strstart(filename, "/dev/cd", NULL)) {
        return 100;
    }
    if (strstart(filename, "/dev/acd", NULL)) {
        return 100;
    }
    return 0;
}
4654 
4655 static int cdrom_reopen(BlockDriverState *bs)
4656 {
4657     BDRVRawState *s = bs->opaque;
4658     int fd;
4659 
4660     /*
4661      * Force reread of possibly changed/newly loaded disc,
4662      * FreeBSD seems to not notice sometimes...
4663      */
4664     if (s->fd >= 0)
4665         qemu_close(s->fd);
4666     fd = qemu_open(bs->filename, s->open_flags, NULL);
4667     if (fd < 0) {
4668         s->fd = -1;
4669         return -EIO;
4670     }
4671     s->fd = fd;
4672 
4673     /* make sure the door isn't locked at this time */
4674     ioctl(s->fd, CDIOCALLOW);
4675     return 0;
4676 }
4677 
4678 static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs)
4679 {
4680     return raw_getlength(bs) > 0;
4681 }
4682 
4683 static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag)
4684 {
4685     BDRVRawState *s = bs->opaque;
4686 
4687     if (s->fd < 0)
4688         return;
4689 
4690     (void) ioctl(s->fd, CDIOCALLOW);
4691 
4692     if (eject_flag) {
4693         if (ioctl(s->fd, CDIOCEJECT) < 0)
4694             perror("CDIOCEJECT");
4695     } else {
4696         if (ioctl(s->fd, CDIOCCLOSE) < 0)
4697             perror("CDIOCCLOSE");
4698     }
4699 
4700     cdrom_reopen(bs);
4701 }
4702 
4703 static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
4704 {
4705     BDRVRawState *s = bs->opaque;
4706 
4707     if (s->fd < 0)
4708         return;
4709     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
4710         /*
4711          * Note: an error can happen if the distribution automatically
4712          * mounts the CD-ROM
4713          */
4714         /* perror("CDROM_LOCKDOOR"); */
4715     }
4716 }
4717 
4718 static BlockDriver bdrv_host_cdrom = {
4719     .format_name            = "host_cdrom",
4720     .protocol_name          = "host_cdrom",
4721     .instance_size          = sizeof(BDRVRawState),
4722     .bdrv_needs_filename    = true,
4723     .bdrv_probe_device      = cdrom_probe_device,
4724     .bdrv_parse_filename    = cdrom_parse_filename,
4725     .bdrv_open              = cdrom_open,
4726     .bdrv_close             = raw_close,
4727     .bdrv_reopen_prepare    = raw_reopen_prepare,
4728     .bdrv_reopen_commit     = raw_reopen_commit,
4729     .bdrv_reopen_abort      = raw_reopen_abort,
4730     .bdrv_co_create_opts    = bdrv_co_create_opts_simple,
4731     .create_opts            = &bdrv_create_opts_simple,
4732     .mutable_opts           = mutable_opts,
4733 
4734     .bdrv_co_preadv         = raw_co_preadv,
4735     .bdrv_co_pwritev        = raw_co_pwritev,
4736     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
4737     .bdrv_refresh_limits    = cdrom_refresh_limits,
4738 
4739     .bdrv_co_truncate                   = raw_co_truncate,
4740     .bdrv_co_getlength                  = raw_co_getlength,
4741     .bdrv_co_get_allocated_file_size    = raw_co_get_allocated_file_size,
4742 
4743     /* removable device support */
4744     .bdrv_co_is_inserted     = cdrom_co_is_inserted,
4745     .bdrv_co_eject           = cdrom_co_eject,
4746     .bdrv_co_lock_medium     = cdrom_co_lock_medium,
4747 };
4748 #endif /* __FreeBSD__ */
4749 
4750 #endif /* HAVE_HOST_BLOCK_DEVICE */
4751 
4752 static void bdrv_file_init(void)
4753 {
4754     /*
4755      * Register all the drivers.  Note that order is important, the driver
4756      * registered last will get probed first.
4757      */
4758     bdrv_register(&bdrv_file);
4759 #if defined(HAVE_HOST_BLOCK_DEVICE)
4760     bdrv_register(&bdrv_host_device);
4761 #ifdef __linux__
4762     bdrv_register(&bdrv_host_cdrom);
4763 #endif
4764 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
4765     bdrv_register(&bdrv_host_cdrom);
4766 #endif
4767 #endif /* HAVE_HOST_BLOCK_DEVICE */
4768 }
4769 
4770 block_init(bdrv_file_init);
4771