xref: /openbmc/qemu/block/file-posix.c (revision 6016b7b4)
1 /*
2  * Block driver for RAW files (posix)
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qapi/error.h"
28 #include "qemu/cutils.h"
29 #include "qemu/error-report.h"
30 #include "block/block_int.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "qemu/units.h"
34 #include "trace.h"
35 #include "block/thread-pool.h"
36 #include "qemu/iov.h"
37 #include "block/raw-aio.h"
38 #include "qapi/qmp/qdict.h"
39 #include "qapi/qmp/qstring.h"
40 
41 #include "scsi/pr-manager.h"
42 #include "scsi/constants.h"
43 
44 #if defined(__APPLE__) && (__MACH__)
45 #include <sys/ioctl.h>
46 #if defined(HAVE_HOST_BLOCK_DEVICE)
47 #include <paths.h>
48 #include <sys/param.h>
49 #include <sys/mount.h>
50 #include <IOKit/IOKitLib.h>
51 #include <IOKit/IOBSD.h>
52 #include <IOKit/storage/IOMediaBSDClient.h>
53 #include <IOKit/storage/IOMedia.h>
54 #include <IOKit/storage/IOCDMedia.h>
55 //#include <IOKit/storage/IOCDTypes.h>
56 #include <IOKit/storage/IODVDMedia.h>
57 #include <CoreFoundation/CoreFoundation.h>
58 #endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
59 #endif
60 
61 #ifdef __sun__
62 #define _POSIX_PTHREAD_SEMANTICS 1
63 #include <sys/dkio.h>
64 #endif
65 #ifdef __linux__
66 #include <sys/ioctl.h>
67 #include <sys/param.h>
68 #include <sys/syscall.h>
69 #include <sys/vfs.h>
70 #include <linux/cdrom.h>
71 #include <linux/fd.h>
72 #include <linux/fs.h>
73 #include <linux/hdreg.h>
74 #include <linux/magic.h>
75 #include <scsi/sg.h>
76 #ifdef __s390__
77 #include <asm/dasd.h>
78 #endif
79 #ifndef FS_NOCOW_FL
80 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
81 #endif
82 #endif
83 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
84 #include <linux/falloc.h>
85 #endif
86 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
87 #include <sys/disk.h>
88 #include <sys/cdio.h>
89 #endif
90 
91 #ifdef __OpenBSD__
92 #include <sys/ioctl.h>
93 #include <sys/disklabel.h>
94 #include <sys/dkio.h>
95 #endif
96 
97 #ifdef __NetBSD__
98 #include <sys/ioctl.h>
99 #include <sys/disklabel.h>
100 #include <sys/dkio.h>
101 #include <sys/disk.h>
102 #endif
103 
104 #ifdef __DragonFly__
105 #include <sys/ioctl.h>
106 #include <sys/diskslice.h>
107 #endif
108 
109 #ifdef CONFIG_XFS
110 #include <xfs/xfs.h>
111 #endif
112 
113 /* OS X does not have O_DSYNC */
114 #ifndef O_DSYNC
115 #ifdef O_SYNC
116 #define O_DSYNC O_SYNC
117 #elif defined(O_FSYNC)
118 #define O_DSYNC O_FSYNC
119 #endif
120 #endif
121 
122 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
123 #ifndef O_DIRECT
124 #define O_DIRECT O_DSYNC
125 #endif
126 
127 #define FTYPE_FILE   0
128 #define FTYPE_CD     1
129 
130 #define MAX_BLOCKSIZE	4096
131 
132 /* POSIX file locking bytes. Libvirt takes byte 0; we start from higher bytes,
133  * leaving a few more bytes for its future use. */
134 #define RAW_LOCK_PERM_BASE             100
135 #define RAW_LOCK_SHARED_BASE           200
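/*
 * Resulting layout (a sketch, assuming the usual BLK_PERM_* bit numbering):
 * permission bit i is guarded by an exclusive byte lock at offset
 * RAW_LOCK_PERM_BASE + i, and its "not shared" marker lives at
 * RAW_LOCK_SHARED_BASE + i. For example, if BLK_PERM_WRITE is bit 1, it maps
 * to bytes 101 and 201 respectively (see raw_apply_lock_bytes() below).
 */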
136 
137 typedef struct BDRVRawState {
138     int fd;
139     bool use_lock;
140     int type;
141     int open_flags;
142     size_t buf_align;
143 
144     /* The current permissions. */
145     uint64_t perm;
146     uint64_t shared_perm;
147 
148     /* The perm bits whose corresponding bytes are already locked in
149      * s->fd. */
150     uint64_t locked_perm;
151     uint64_t locked_shared_perm;
152 
153     uint64_t aio_max_batch;
154 
155     int perm_change_fd;
156     int perm_change_flags;
157     BDRVReopenState *reopen_state;
158 
159 #ifdef CONFIG_XFS
160     bool is_xfs:1;
161 #endif
162     bool has_discard:1;
163     bool has_write_zeroes:1;
164     bool discard_zeroes:1;
165     bool use_linux_aio:1;
166     bool use_linux_io_uring:1;
167     int page_cache_inconsistent; /* errno from fdatasync failure */
168     bool has_fallocate;
169     bool needs_alignment;
170     bool force_alignment;
171     bool drop_cache;
172     bool check_cache_dropped;
173     struct {
174         uint64_t discard_nb_ok;
175         uint64_t discard_nb_failed;
176         uint64_t discard_bytes_ok;
177     } stats;
178 
179     PRManager *pr_mgr;
180 } BDRVRawState;
181 
182 typedef struct BDRVRawReopenState {
183     int open_flags;
184     bool drop_cache;
185     bool check_cache_dropped;
186 } BDRVRawReopenState;
187 
188 static int fd_open(BlockDriverState *bs)
189 {
190     BDRVRawState *s = bs->opaque;
191 
192     /* This is just to ensure s->fd is sane (it's called by I/O ops) */
193     if (s->fd >= 0) {
194         return 0;
195     }
196     return -EIO;
197 }
198 
199 static int64_t raw_getlength(BlockDriverState *bs);
200 
201 typedef struct RawPosixAIOData {
202     BlockDriverState *bs;
203     int aio_type;
204     int aio_fildes;
205 
206     off_t aio_offset;
207     uint64_t aio_nbytes;
208 
209     union {
210         struct {
211             struct iovec *iov;
212             int niov;
213         } io;
214         struct {
215             uint64_t cmd;
216             void *buf;
217         } ioctl;
218         struct {
219             int aio_fd2;
220             off_t aio_offset2;
221         } copy_range;
222         struct {
223             PreallocMode prealloc;
224             Error **errp;
225         } truncate;
226     };
227 } RawPosixAIOData;
228 
229 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
230 static int cdrom_reopen(BlockDriverState *bs);
231 #endif
232 
233 /*
234  * Elide EAGAIN and EACCES details when failing to lock, as this
235  * indicates that the specified file region is already locked by
236  * another process, which is considered a common scenario.
237  */
238 #define raw_lock_error_setg_errno(errp, err, fmt, ...)                  \
239     do {                                                                \
240         if ((err) == EAGAIN || (err) == EACCES) {                       \
241             error_setg((errp), (fmt), ## __VA_ARGS__);                  \
242         } else {                                                        \
243             error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__);     \
244         }                                                               \
245     } while (0)
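/*
 * Typical use (a sketch based on the callers below): pass a positive errno,
 * e.g.
 *
 *     raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d", off);
 *
 * where ret is the negative errno returned by qemu_lock_fd().
 */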
246 
247 #if defined(__NetBSD__)
248 static int raw_normalize_devicepath(const char **filename, Error **errp)
249 {
250     static char namebuf[PATH_MAX];
251     const char *dp, *fname;
252     struct stat sb;
253 
254     fname = *filename;
255     dp = strrchr(fname, '/');
256     if (lstat(fname, &sb) < 0) {
257         error_setg_file_open(errp, errno, fname);
258         return -errno;
259     }
260 
261     if (!S_ISBLK(sb.st_mode)) {
262         return 0;
263     }
264 
265     if (dp == NULL) {
266         snprintf(namebuf, PATH_MAX, "r%s", fname);
267     } else {
268         snprintf(namebuf, PATH_MAX, "%.*s/r%s",
269             (int)(dp - fname), fname, dp + 1);
270     }
271     *filename = namebuf;
272     warn_report("%s is a block device, using %s", fname, *filename);
273 
274     return 0;
275 }
276 #else
277 static int raw_normalize_devicepath(const char **filename, Error **errp)
278 {
279     return 0;
280 }
281 #endif
282 
283 /*
284  * Get logical block size via ioctl. On success, store it in @sector_size_p
 * and return 0; on failure, return -errno.
285  */
286 static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
287 {
288     unsigned int sector_size;
289     bool success = false;
290     int i;
291 
292     errno = ENOTSUP;
293     static const unsigned long ioctl_list[] = {
294 #ifdef BLKSSZGET
295         BLKSSZGET,
296 #endif
297 #ifdef DKIOCGETBLOCKSIZE
298         DKIOCGETBLOCKSIZE,
299 #endif
300 #ifdef DIOCGSECTORSIZE
301         DIOCGSECTORSIZE,
302 #endif
303     };
304 
305     /* Try a few ioctls to get the right size */
306     for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
307         if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
308             *sector_size_p = sector_size;
309             success = true;
310         }
311     }
312 
313     return success ? 0 : -errno;
314 }
315 
316 /**
317  * Get physical block size of @fd.
318  * On success, store it in @blk_size and return 0.
319  * On failure, return -errno.
320  */
321 static int probe_physical_blocksize(int fd, unsigned int *blk_size)
322 {
323 #ifdef BLKPBSZGET
324     if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
325         return -errno;
326     }
327     return 0;
328 #else
329     return -ENOTSUP;
330 #endif
331 }
332 
333 /*
334  * Returns true if no alignment restrictions are necessary even for files
335  * opened with O_DIRECT.
336  *
337  * raw_probe_alignment() probes the required alignment and assumes that a
338  * result of 1 means the probing failed, so it falls back to a safe default of
339  * 4k. This can be avoided if we know that byte alignment is okay for the file.
340  */
341 static bool dio_byte_aligned(int fd)
342 {
343 #ifdef __linux__
344     struct statfs buf;
345     int ret;
346 
347     ret = fstatfs(fd, &buf);
348     if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) {
349         return true;
350     }
351 #endif
352     return false;
353 }
354 
355 static bool raw_needs_alignment(BlockDriverState *bs)
356 {
357     BDRVRawState *s = bs->opaque;
358 
359     if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
360         return true;
361     }
362 
363     return s->force_alignment;
364 }
365 
366 /* Check whether a read with the given memory buffer and length succeeds.
367  *
368  * This function is used to probe O_DIRECT memory buffer and request alignment.
369  */
370 static bool raw_is_io_aligned(int fd, void *buf, size_t len)
371 {
372     ssize_t ret = pread(fd, buf, len, 0);
373 
374     if (ret >= 0) {
375         return true;
376     }
377 
378 #ifdef __linux__
379     /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
380      * other errors (e.g. real I/O error), which could happen on a failed
381      * drive, since we only care about probing alignment.
382      */
383     if (errno != EINVAL) {
384         return true;
385     }
386 #endif
387 
388     return false;
389 }
390 
391 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
392 {
393     BDRVRawState *s = bs->opaque;
394     char *buf;
395     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
396     size_t alignments[] = {1, 512, 1024, 2048, 4096};
397 
398     /* For SCSI generic devices the alignment is not really used.
399        With buffered I/O, we don't have any restrictions. */
400     if (bdrv_is_sg(bs) || !s->needs_alignment) {
401         bs->bl.request_alignment = 1;
402         s->buf_align = 1;
403         return;
404     }
405 
406     bs->bl.request_alignment = 0;
407     s->buf_align = 0;
408     /* Let's try to use the logical blocksize for the alignment. */
409     if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
410         bs->bl.request_alignment = 0;
411     }
412 #ifdef CONFIG_XFS
413     if (s->is_xfs) {
414         struct dioattr da;
415         if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
416             bs->bl.request_alignment = da.d_miniosz;
417             /* The kernel returns wrong information for d_mem */
418             /* s->buf_align = da.d_mem; */
419         }
420     }
421 #endif
422 
423     /*
424      * If we could not get the sizes so far, we can only guess them. First try
425      * to detect request alignment, since it is more likely to succeed. Then
426      * try to detect buf_align, which cannot be detected in some cases (e.g.
427  * Gluster). If buf_align cannot be detected, we fall back to the value of
428      * request_alignment.
429      */
430 
431     if (!bs->bl.request_alignment) {
432         int i;
433         size_t align;
434         buf = qemu_memalign(max_align, max_align);
435         for (i = 0; i < ARRAY_SIZE(alignments); i++) {
436             align = alignments[i];
437             if (raw_is_io_aligned(fd, buf, align)) {
438                 /* Fall back to a safe value. */
439                 bs->bl.request_alignment = (align != 1) ? align : max_align;
440                 break;
441             }
442         }
443         qemu_vfree(buf);
444     }
445 
446     if (!s->buf_align) {
447         int i;
448         size_t align;
449         buf = qemu_memalign(max_align, 2 * max_align);
450         for (i = 0; i < ARRAY_SIZE(alignments); i++) {
451             align = alignments[i];
452             if (raw_is_io_aligned(fd, buf + align, max_align)) {
453                 /* Fall back to request_alignment. */
454                 s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
455                 break;
456             }
457         }
458         qemu_vfree(buf);
459     }
460 
461     if (!s->buf_align || !bs->bl.request_alignment) {
462         error_setg(errp, "Could not find working O_DIRECT alignment");
463         error_append_hint(errp, "Try cache.direct=off\n");
464     }
465 }
466 
467 static int check_hdev_writable(int fd)
468 {
469 #if defined(BLKROGET)
470     /* Linux block devices can be configured "read-only" using blockdev(8).
471      * This is independent of device node permissions and therefore open(2)
472      * with O_RDWR succeeds.  Actual writes fail with EPERM.
473      *
474      * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
475      * check for read-only block devices so that Linux block devices behave
476      * properly.
477      */
478     struct stat st;
479     int readonly = 0;
480 
481     if (fstat(fd, &st)) {
482         return -errno;
483     }
484 
485     if (!S_ISBLK(st.st_mode)) {
486         return 0;
487     }
488 
489     if (ioctl(fd, BLKROGET, &readonly) < 0) {
490         return -errno;
491     }
492 
493     if (readonly) {
494         return -EACCES;
495     }
496 #endif /* defined(BLKROGET) */
497     return 0;
498 }
499 
500 static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
501 {
502     bool read_write = false;
503     assert(open_flags != NULL);
504 
505     *open_flags |= O_BINARY;
506     *open_flags &= ~O_ACCMODE;
507 
508     if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
509         read_write = has_writers;
510     } else if (bdrv_flags & BDRV_O_RDWR) {
511         read_write = true;
512     }
513 
514     if (read_write) {
515         *open_flags |= O_RDWR;
516     } else {
517         *open_flags |= O_RDONLY;
518     }
519 
520     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
521      * and O_DIRECT for no caching. */
522     if ((bdrv_flags & BDRV_O_NOCACHE)) {
523         *open_flags |= O_DIRECT;
524     }
525 }
526 
527 static void raw_parse_filename(const char *filename, QDict *options,
528                                Error **errp)
529 {
530     bdrv_parse_filename_strip_prefix(filename, "file:", options);
531 }
532 
533 static QemuOptsList raw_runtime_opts = {
534     .name = "raw",
535     .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
536     .desc = {
537         {
538             .name = "filename",
539             .type = QEMU_OPT_STRING,
540             .help = "File name of the image",
541         },
542         {
543             .name = "aio",
544             .type = QEMU_OPT_STRING,
545             .help = "host AIO implementation (threads, native, io_uring)",
546         },
547         {
548             .name = "aio-max-batch",
549             .type = QEMU_OPT_NUMBER,
550             .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
551         },
552         {
553             .name = "locking",
554             .type = QEMU_OPT_STRING,
555             .help = "file locking mode (on/off/auto, default: auto)",
556         },
557         {
558             .name = "pr-manager",
559             .type = QEMU_OPT_STRING,
560             .help = "id of persistent reservation manager object (default: none)",
561         },
562 #if defined(__linux__)
563         {
564             .name = "drop-cache",
565             .type = QEMU_OPT_BOOL,
566             .help = "invalidate page cache during live migration (default: on)",
567         },
568 #endif
569         {
570             .name = "x-check-cache-dropped",
571             .type = QEMU_OPT_BOOL,
572             .help = "check that page cache was dropped on live migration (default: off)"
573         },
574         { /* end of list */ }
575     },
576 };
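/*
 * These runtime options map to -blockdev / blockdev-add parameters of the
 * "file" driver, e.g. (a sketch; node name and file name chosen arbitrarily):
 *
 *     -blockdev driver=file,node-name=disk0,filename=test.img,aio=native,locking=auto
 */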
577 
578 static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
579 
580 static int raw_open_common(BlockDriverState *bs, QDict *options,
581                            int bdrv_flags, int open_flags,
582                            bool device, Error **errp)
583 {
584     BDRVRawState *s = bs->opaque;
585     QemuOpts *opts;
586     Error *local_err = NULL;
587     const char *filename = NULL;
588     const char *str;
589     BlockdevAioOptions aio, aio_default;
590     int fd, ret;
591     struct stat st;
592     OnOffAuto locking;
593 
594     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
595     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
596         ret = -EINVAL;
597         goto fail;
598     }
599 
600     filename = qemu_opt_get(opts, "filename");
601 
602     ret = raw_normalize_devicepath(&filename, errp);
603     if (ret != 0) {
604         goto fail;
605     }
606 
607     if (bdrv_flags & BDRV_O_NATIVE_AIO) {
608         aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
609 #ifdef CONFIG_LINUX_IO_URING
610     } else if (bdrv_flags & BDRV_O_IO_URING) {
611         aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
612 #endif
613     } else {
614         aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
615     }
616 
617     aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
618                           qemu_opt_get(opts, "aio"),
619                           aio_default, &local_err);
620     if (local_err) {
621         error_propagate(errp, local_err);
622         ret = -EINVAL;
623         goto fail;
624     }
625 
626     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
627 #ifdef CONFIG_LINUX_IO_URING
628     s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
629 #endif
630 
631     s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);
632 
633     locking = qapi_enum_parse(&OnOffAuto_lookup,
634                               qemu_opt_get(opts, "locking"),
635                               ON_OFF_AUTO_AUTO, &local_err);
636     if (local_err) {
637         error_propagate(errp, local_err);
638         ret = -EINVAL;
639         goto fail;
640     }
641     switch (locking) {
642     case ON_OFF_AUTO_ON:
643         s->use_lock = true;
644         if (!qemu_has_ofd_lock()) {
645             warn_report("File lock requested but OFD locking syscall is "
646                         "unavailable, falling back to POSIX file locks");
647             error_printf("Due to the implementation, locks can be lost "
648                          "unexpectedly.\n");
649         }
650         break;
651     case ON_OFF_AUTO_OFF:
652         s->use_lock = false;
653         break;
654     case ON_OFF_AUTO_AUTO:
655         s->use_lock = qemu_has_ofd_lock();
656         break;
657     default:
658         abort();
659     }
660 
661     str = qemu_opt_get(opts, "pr-manager");
662     if (str) {
663         s->pr_mgr = pr_manager_lookup(str, &local_err);
664         if (local_err) {
665             error_propagate(errp, local_err);
666             ret = -EINVAL;
667             goto fail;
668         }
669     }
670 
671     s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
672     s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
673                                                false);
674 
675     s->open_flags = open_flags;
676     raw_parse_flags(bdrv_flags, &s->open_flags, false);
677 
678     s->fd = -1;
679     fd = qemu_open(filename, s->open_flags, errp);
680     ret = fd < 0 ? -errno : 0;
681 
682     if (ret < 0) {
683         if (ret == -EROFS) {
684             ret = -EACCES;
685         }
686         goto fail;
687     }
688     s->fd = fd;
689 
690     /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
691     if (s->open_flags & O_RDWR) {
692         ret = check_hdev_writable(s->fd);
693         if (ret < 0) {
694             error_setg_errno(errp, -ret, "The device is not writable");
695             goto fail;
696         }
697     }
698 
699     s->perm = 0;
700     s->shared_perm = BLK_PERM_ALL;
701 
702 #ifdef CONFIG_LINUX_AIO
703      /* Currently Linux does AIO only for files opened with O_DIRECT */
704     if (s->use_linux_aio) {
705         if (!(s->open_flags & O_DIRECT)) {
706             error_setg(errp, "aio=native was specified, but it requires "
707                              "cache.direct=on, which was not specified.");
708             ret = -EINVAL;
709             goto fail;
710         }
711         if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
712             error_prepend(errp, "Unable to use native AIO: ");
713             goto fail;
714         }
715     }
716 #else
717     if (s->use_linux_aio) {
718         error_setg(errp, "aio=native was specified, but is not supported "
719                          "in this build.");
720         ret = -EINVAL;
721         goto fail;
722     }
723 #endif /* !defined(CONFIG_LINUX_AIO) */
724 
725 #ifdef CONFIG_LINUX_IO_URING
726     if (s->use_linux_io_uring) {
727         if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
728             error_prepend(errp, "Unable to use io_uring: ");
729             goto fail;
730         }
731     }
732 #else
733     if (s->use_linux_io_uring) {
734         error_setg(errp, "aio=io_uring was specified, but is not supported "
735                          "in this build.");
736         ret = -EINVAL;
737         goto fail;
738     }
739 #endif /* !defined(CONFIG_LINUX_IO_URING) */
740 
741     s->has_discard = true;
742     s->has_write_zeroes = true;
743 
744     if (fstat(s->fd, &st) < 0) {
745         ret = -errno;
746         error_setg_errno(errp, errno, "Could not stat file");
747         goto fail;
748     }
749 
750     if (!device) {
751         if (!S_ISREG(st.st_mode)) {
752             error_setg(errp, "'%s' driver requires '%s' to be a regular file",
753                        bs->drv->format_name, bs->filename);
754             ret = -EINVAL;
755             goto fail;
756         } else {
757             s->discard_zeroes = true;
758             s->has_fallocate = true;
759         }
760     } else {
761         if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
762             error_setg(errp, "'%s' driver requires '%s' to be either "
763                        "a character or block device",
764                        bs->drv->format_name, bs->filename);
765             ret = -EINVAL;
766             goto fail;
767         }
768     }
769 
770     if (S_ISBLK(st.st_mode)) {
771 #ifdef BLKDISCARDZEROES
772         unsigned int arg;
773         if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
774             s->discard_zeroes = true;
775         }
776 #endif
777 #ifdef __linux__
778         /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
779          * not rely on the contents of discarded blocks unless using O_DIRECT.
780          * Same for BLKZEROOUT.
781          */
782         if (!(bs->open_flags & BDRV_O_NOCACHE)) {
783             s->discard_zeroes = false;
784             s->has_write_zeroes = false;
785         }
786 #endif
787     }
788 #ifdef __FreeBSD__
789     if (S_ISCHR(st.st_mode)) {
790         /*
791          * The file is a char device (disk), which on FreeBSD isn't behind
792          * a pager, so force all requests to be aligned. This is needed so
793          * QEMU makes sure all I/O operations on the device are aligned to
794          * the sector size, or else FreeBSD will reject them with EINVAL.
795          */
796         s->force_alignment = true;
797     }
798 #endif
799     s->needs_alignment = raw_needs_alignment(bs);
800 
801 #ifdef CONFIG_XFS
802     if (platform_test_xfs_fd(s->fd)) {
803         s->is_xfs = true;
804     }
805 #endif
806 
807     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
808     if (S_ISREG(st.st_mode)) {
809         /* When extending regular files, we get zeros from the OS */
810         bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
811     }
812     ret = 0;
813 fail:
814     if (ret < 0 && s->fd != -1) {
815         qemu_close(s->fd);
816     }
817     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
818         unlink(filename);
819     }
820     qemu_opts_del(opts);
821     return ret;
822 }
823 
824 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
825                     Error **errp)
826 {
827     BDRVRawState *s = bs->opaque;
828 
829     s->type = FTYPE_FILE;
830     return raw_open_common(bs, options, flags, 0, false, errp);
831 }
832 
833 typedef enum {
834     RAW_PL_PREPARE,
835     RAW_PL_COMMIT,
836     RAW_PL_ABORT,
837 } RawPermLockOp;
838 
839 #define PERM_FOREACH(i) \
840     for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
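/* PERM_FOREACH(i) iterates over bit positions 0 through the highest bit set
 * in BLK_PERM_ALL, i.e. one iteration per permission bit, so each permission
 * gets its own lock byte at the offsets defined above. */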
841 
842 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
843  * file; if @unlock == true, also unlock the unneeded bytes.
844  * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
845  */
846 static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
847                                 uint64_t perm_lock_bits,
848                                 uint64_t shared_perm_lock_bits,
849                                 bool unlock, Error **errp)
850 {
851     int ret;
852     int i;
853     uint64_t locked_perm, locked_shared_perm;
854 
855     if (s) {
856         locked_perm = s->locked_perm;
857         locked_shared_perm = s->locked_shared_perm;
858     } else {
859         /*
860          * We don't have the previous bits, just lock/unlock for each of the
861          * requested bits.
862          */
863         if (unlock) {
864             locked_perm = BLK_PERM_ALL;
865             locked_shared_perm = BLK_PERM_ALL;
866         } else {
867             locked_perm = 0;
868             locked_shared_perm = 0;
869         }
870     }
871 
872     PERM_FOREACH(i) {
873         int off = RAW_LOCK_PERM_BASE + i;
874         uint64_t bit = (1ULL << i);
875         if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
876             ret = qemu_lock_fd(fd, off, 1, false);
877             if (ret) {
878                 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
879                                           off);
880                 return ret;
881             } else if (s) {
882                 s->locked_perm |= bit;
883             }
884         } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
885             ret = qemu_unlock_fd(fd, off, 1);
886             if (ret) {
887                 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
888                 return ret;
889             } else if (s) {
890                 s->locked_perm &= ~bit;
891             }
892         }
893     }
894     PERM_FOREACH(i) {
895         int off = RAW_LOCK_SHARED_BASE + i;
896         uint64_t bit = (1ULL << i);
897         if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
898             ret = qemu_lock_fd(fd, off, 1, false);
899             if (ret) {
900                 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
901                                           off);
902                 return ret;
903             } else if (s) {
904                 s->locked_shared_perm |= bit;
905             }
906         } else if (unlock && (locked_shared_perm & bit) &&
907                    !(shared_perm_lock_bits & bit)) {
908             ret = qemu_unlock_fd(fd, off, 1);
909             if (ret) {
910                 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
911                 return ret;
912             } else if (s) {
913                 s->locked_shared_perm &= ~bit;
914             }
915         }
916     }
917     return 0;
918 }
919 
920 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
921 static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
922                                 Error **errp)
923 {
924     int ret;
925     int i;
926 
927     PERM_FOREACH(i) {
928         int off = RAW_LOCK_SHARED_BASE + i;
929         uint64_t p = 1ULL << i;
930         if (perm & p) {
931             ret = qemu_lock_fd_test(fd, off, 1, true);
932             if (ret) {
933                 char *perm_name = bdrv_perm_names(p);
934 
935                 raw_lock_error_setg_errno(errp, -ret,
936                                           "Failed to get \"%s\" lock",
937                                           perm_name);
938                 g_free(perm_name);
939                 return ret;
940             }
941         }
942     }
943     PERM_FOREACH(i) {
944         int off = RAW_LOCK_PERM_BASE + i;
945         uint64_t p = 1ULL << i;
946         if (!(shared_perm & p)) {
947             ret = qemu_lock_fd_test(fd, off, 1, true);
948             if (ret) {
949                 char *perm_name = bdrv_perm_names(p);
950 
951                 raw_lock_error_setg_errno(errp, -ret,
952                                           "Failed to get shared \"%s\" lock",
953                                           perm_name);
954                 g_free(perm_name);
955                 return ret;
956             }
957         }
958     }
959     return 0;
960 }
961 
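/*
 * Perform the locking side of a permission change. RAW_PL_PREPARE acquires
 * any newly needed byte locks and verifies that no conflicting locks are held
 * by other processes; RAW_PL_COMMIT drops locks that are no longer needed for
 * the new permissions; RAW_PL_ABORT reverts to the previously locked state.
 */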
962 static int raw_handle_perm_lock(BlockDriverState *bs,
963                                 RawPermLockOp op,
964                                 uint64_t new_perm, uint64_t new_shared,
965                                 Error **errp)
966 {
967     BDRVRawState *s = bs->opaque;
968     int ret = 0;
969     Error *local_err = NULL;
970 
971     if (!s->use_lock) {
972         return 0;
973     }
974 
975     if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
976         return 0;
977     }
978 
979     switch (op) {
980     case RAW_PL_PREPARE:
981         if ((s->perm | new_perm) == s->perm &&
982             (s->shared_perm & new_shared) == s->shared_perm)
983         {
984             /*
985              * We are going to unlock bytes; that should not fail. If it fails
986              * due to some fs-dependent, permission-unrelated reason (which occurs
987              * sometimes on NFS and leads to an abort in bdrv_replace_child), we
988              * can't prevent such errors by any check here. And we ignore them
989              * anyway in ABORT and COMMIT.
990              */
991             return 0;
992         }
993         ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
994                                    ~s->shared_perm | ~new_shared,
995                                    false, errp);
996         if (!ret) {
997             ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
998             if (!ret) {
999                 return 0;
1000             }
1001             error_append_hint(errp,
1002                               "Is another process using the image [%s]?\n",
1003                               bs->filename);
1004         }
1005         /* fall through to unlock bytes. */
1006     case RAW_PL_ABORT:
1007         raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
1008                              true, &local_err);
1009         if (local_err) {
1010             /* Theoretically the above call only unlocks bytes and it cannot
1011              * fail. Something weird happened, report it.
1012              */
1013             warn_report_err(local_err);
1014         }
1015         break;
1016     case RAW_PL_COMMIT:
1017         raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
1018                              true, &local_err);
1019         if (local_err) {
1020             /* Theoretically the above call only unlocks bytes and it cannot
1021              * fail. Something weird happened, report it.
1022              */
1023             warn_report_err(local_err);
1024         }
1025         break;
1026     }
1027     return ret;
1028 }
1029 
1030 static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
1031                                  int *open_flags, uint64_t perm, bool force_dup,
1032                                  Error **errp)
1033 {
1034     BDRVRawState *s = bs->opaque;
1035     int fd = -1;
1036     int ret;
1037     bool has_writers = perm &
1038         (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
1039     int fcntl_flags = O_APPEND | O_NONBLOCK;
1040 #ifdef O_NOATIME
1041     fcntl_flags |= O_NOATIME;
1042 #endif
1043 
1044     *open_flags = 0;
1045     if (s->type == FTYPE_CD) {
1046         *open_flags |= O_NONBLOCK;
1047     }
1048 
1049     raw_parse_flags(flags, open_flags, has_writers);
1050 
1051 #ifdef O_ASYNC
1052     /* Not all operating systems have O_ASYNC, and those that don't
1053      * will not let us track the state into rs->open_flags (typically
1054      * you achieve the same effect with an ioctl, for example I_SETSIG
1055      * on Solaris). But we do not use O_ASYNC, so that's fine.
1056      */
1057     assert((s->open_flags & O_ASYNC) == 0);
1058 #endif
1059 
1060     if (!force_dup && *open_flags == s->open_flags) {
1061         /* We're lucky, the existing fd is fine */
1062         return s->fd;
1063     }
1064 
1065     if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
1066         /* dup the original fd */
1067         fd = qemu_dup(s->fd);
1068         if (fd >= 0) {
1069             ret = fcntl_setfl(fd, *open_flags);
1070             if (ret) {
1071                 qemu_close(fd);
1072                 fd = -1;
1073             }
1074         }
1075     }
1076 
1077     /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
1078     if (fd == -1) {
1079         const char *normalized_filename = bs->filename;
1080         ret = raw_normalize_devicepath(&normalized_filename, errp);
1081         if (ret >= 0) {
1082             fd = qemu_open(normalized_filename, *open_flags, errp);
1083             if (fd == -1) {
1084                 return -1;
1085             }
1086         }
1087     }
1088 
1089     if (fd != -1 && (*open_flags & O_RDWR)) {
1090         ret = check_hdev_writable(fd);
1091         if (ret < 0) {
1092             qemu_close(fd);
1093             error_setg_errno(errp, -ret, "The device is not writable");
1094             return -1;
1095         }
1096     }
1097 
1098     return fd;
1099 }
1100 
1101 static int raw_reopen_prepare(BDRVReopenState *state,
1102                               BlockReopenQueue *queue, Error **errp)
1103 {
1104     BDRVRawState *s;
1105     BDRVRawReopenState *rs;
1106     QemuOpts *opts;
1107     int ret;
1108 
1109     assert(state != NULL);
1110     assert(state->bs != NULL);
1111 
1112     s = state->bs->opaque;
1113 
1114     state->opaque = g_new0(BDRVRawReopenState, 1);
1115     rs = state->opaque;
1116 
1117     /* Handle options changes */
1118     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
1119     if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
1120         ret = -EINVAL;
1121         goto out;
1122     }
1123 
1124     rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
1125     rs->check_cache_dropped =
1126         qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
1127 
1128     /* This driver's reopen function doesn't currently allow changing
1129      * other options, so let's put them back in the original QDict and
1130      * bdrv_reopen_prepare() will detect changes and complain. */
1131     qemu_opts_to_qdict(opts, state->options);
1132 
1133     /*
1134      * As part of reopen prepare we also want to create a new fd via
1135      * raw_reconfigure_getfd(). But it needs the updated "perm", while in
1136      * bdrv_reopen_multiple() the .bdrv_reopen_prepare() callback is called
1137      * prior to the permission update. Happily, the permission update is always
1138      * a part (a separate stage) of bdrv_reopen_multiple(), so we can rely on
1139      * this fact and reconfigure the fd in raw_check_perm().
1140      */
1141 
1142     s->reopen_state = state;
1143     ret = 0;
1144 
1145 out:
1146     qemu_opts_del(opts);
1147     return ret;
1148 }
1149 
1150 static void raw_reopen_commit(BDRVReopenState *state)
1151 {
1152     BDRVRawReopenState *rs = state->opaque;
1153     BDRVRawState *s = state->bs->opaque;
1154 
1155     s->drop_cache = rs->drop_cache;
1156     s->check_cache_dropped = rs->check_cache_dropped;
1157     s->open_flags = rs->open_flags;
1158     g_free(state->opaque);
1159     state->opaque = NULL;
1160 
1161     assert(s->reopen_state == state);
1162     s->reopen_state = NULL;
1163 }
1164 
1165 
1166 static void raw_reopen_abort(BDRVReopenState *state)
1167 {
1168     BDRVRawReopenState *rs = state->opaque;
1169     BDRVRawState *s = state->bs->opaque;
1170 
1171      /* nothing to do if NULL, we didn't get far enough */
1172     if (rs == NULL) {
1173         return;
1174     }
1175 
1176     g_free(state->opaque);
1177     state->opaque = NULL;
1178 
1179     assert(s->reopen_state == state);
1180     s->reopen_state = NULL;
1181 }
1182 
1183 static int hdev_get_max_hw_transfer(int fd, struct stat *st)
1184 {
1185 #ifdef BLKSECTGET
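    /*
     * BLKSECTGET reports a sector count (in an unsigned short) for block
     * devices, but a byte count (in an int) for SG character devices, hence
     * the differently typed branches below.
     */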
1186     if (S_ISBLK(st->st_mode)) {
1187         unsigned short max_sectors = 0;
1188         if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
1189             return max_sectors * 512;
1190         }
1191     } else {
1192         int max_bytes = 0;
1193         if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
1194             return max_bytes;
1195         }
1196     }
1197     return -errno;
1198 #else
1199     return -ENOSYS;
1200 #endif
1201 }
1202 
1203 static int hdev_get_max_segments(int fd, struct stat *st)
1204 {
1205 #ifdef CONFIG_LINUX
1206     char buf[32];
1207     const char *end;
1208     char *sysfspath = NULL;
1209     int ret;
1210     int sysfd = -1;
1211     long max_segments;
1212 
1213     if (S_ISCHR(st->st_mode)) {
1214         if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
1215             return ret;
1216         }
1217         return -ENOTSUP;
1218     }
1219 
1220     if (!S_ISBLK(st->st_mode)) {
1221         return -ENOTSUP;
1222     }
1223 
1224     sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
1225                                 major(st->st_rdev), minor(st->st_rdev));
1226     sysfd = open(sysfspath, O_RDONLY);
1227     if (sysfd == -1) {
1228         ret = -errno;
1229         goto out;
1230     }
1231     do {
1232         ret = read(sysfd, buf, sizeof(buf) - 1);
1233     } while (ret == -1 && errno == EINTR);
1234     if (ret < 0) {
1235         ret = -errno;
1236         goto out;
1237     } else if (ret == 0) {
1238         ret = -EIO;
1239         goto out;
1240     }
1241     buf[ret] = 0;
1242     /* The file ends with '\n'; pass 'end' to accept that. */
1243     ret = qemu_strtol(buf, &end, 10, &max_segments);
1244     if (ret == 0 && end && *end == '\n') {
1245         ret = max_segments;
1246     }
1247 
1248 out:
1249     if (sysfd != -1) {
1250         close(sysfd);
1251     }
1252     g_free(sysfspath);
1253     return ret;
1254 #else
1255     return -ENOTSUP;
1256 #endif
1257 }
1258 
1259 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1260 {
1261     BDRVRawState *s = bs->opaque;
1262     struct stat st;
1263 
1264     s->needs_alignment = raw_needs_alignment(bs);
1265     raw_probe_alignment(bs, s->fd, errp);
1266 
1267     bs->bl.min_mem_alignment = s->buf_align;
1268     bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
1269 
1270     /*
1271      * Maximum transfers are best effort, so it is okay to ignore any
1272      * errors.  That said, based on the man page, errors in fstat would be
1273      * very much unexpected; the only possible case seems to be ENOMEM.
1274      */
1275     if (fstat(s->fd, &st)) {
1276         return;
1277     }
1278 
1279 #if defined(__APPLE__) && (__MACH__)
1280     struct statfs buf;
1281 
1282     if (!fstatfs(s->fd, &buf)) {
1283         bs->bl.opt_transfer = buf.f_iosize;
1284         bs->bl.pdiscard_alignment = buf.f_bsize;
1285     }
1286 #endif
1287 
1288     if (bs->sg || S_ISBLK(st.st_mode)) {
1289         int ret = hdev_get_max_hw_transfer(s->fd, &st);
1290 
1291         if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1292             bs->bl.max_hw_transfer = ret;
1293         }
1294 
1295         ret = hdev_get_max_segments(s->fd, &st);
1296         if (ret > 0) {
1297             bs->bl.max_hw_iov = ret;
1298         }
1299     }
1300 }
1301 
1302 static int check_for_dasd(int fd)
1303 {
1304 #ifdef BIODASDINFO2
1305     struct dasd_information2_t info = {0};
1306 
1307     return ioctl(fd, BIODASDINFO2, &info);
1308 #else
1309     return -1;
1310 #endif
1311 }
1312 
1313 /**
1314  * Try to get @bs's logical and physical block size.
1315  * On success, store them in @bsz and return zero.
1316  * On failure, return negative errno.
1317  */
1318 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1319 {
1320     BDRVRawState *s = bs->opaque;
1321     int ret;
1322 
1323     /* If DASD, get blocksizes */
1324     if (check_for_dasd(s->fd) < 0) {
1325         return -ENOTSUP;
1326     }
1327     ret = probe_logical_blocksize(s->fd, &bsz->log);
1328     if (ret < 0) {
1329         return ret;
1330     }
1331     return probe_physical_blocksize(s->fd, &bsz->phys);
1332 }
1333 
1334 /**
1335  * Try to get @bs's geometry: cyls, heads, sectors.
1336  * On success, store them in @geo and return 0.
1337  * On failure return -errno.
1338  * (Allows block driver to assign default geometry values that guest sees)
1339  */
1340 #ifdef __linux__
1341 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1342 {
1343     BDRVRawState *s = bs->opaque;
1344     struct hd_geometry ioctl_geo = {0};
1345 
1346     /* If DASD, get its geometry */
1347     if (check_for_dasd(s->fd) < 0) {
1348         return -ENOTSUP;
1349     }
1350     if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1351         return -errno;
1352     }
1353     /* HDIO_GETGEO may return success even though geo contains zeros
1354        (e.g. certain multipath setups) */
1355     if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1356         return -ENOTSUP;
1357     }
1358     /* Do not return a geometry for a partition */
1359     if (ioctl_geo.start != 0) {
1360         return -ENOTSUP;
1361     }
1362     geo->heads = ioctl_geo.heads;
1363     geo->sectors = ioctl_geo.sectors;
1364     geo->cylinders = ioctl_geo.cylinders;
1365 
1366     return 0;
1367 }
1368 #else /* __linux__ */
1369 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1370 {
1371     return -ENOTSUP;
1372 }
1373 #endif
1374 
1375 #if defined(__linux__)
1376 static int handle_aiocb_ioctl(void *opaque)
1377 {
1378     RawPosixAIOData *aiocb = opaque;
1379     int ret;
1380 
1381     do {
1382         ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
1383     } while (ret == -1 && errno == EINTR);
1384     if (ret == -1) {
1385         return -errno;
1386     }
1387 
1388     return 0;
1389 }
1390 #endif /* linux */
1391 
1392 static int handle_aiocb_flush(void *opaque)
1393 {
1394     RawPosixAIOData *aiocb = opaque;
1395     BDRVRawState *s = aiocb->bs->opaque;
1396     int ret;
1397 
1398     if (s->page_cache_inconsistent) {
1399         return -s->page_cache_inconsistent;
1400     }
1401 
1402     ret = qemu_fdatasync(aiocb->aio_fildes);
1403     if (ret == -1) {
1404         trace_file_flush_fdatasync_failed(errno);
1405 
1406         /* There is no clear definition of the semantics of a failing fsync(),
1407          * so we may have to assume the worst. The sad truth is that this
1408          * assumption is correct for Linux. Some pages are now probably marked
1409          * clean in the page cache even though they are inconsistent with the
1410          * on-disk contents. The next fdatasync() call would succeed, but no
1411          * further writeback attempt will be made. We can't get back to a state
1412          * in which we know what is on disk (we would have to rewrite
1413          * everything that was touched since the last fdatasync() at least), so
1414          * make bdrv_flush() fail permanently. Given that the behaviour isn't
1415          * really defined, I have little hope that other OSes are doing better.
1416          *
1417          * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1418          * cache. */
1419         if ((s->open_flags & O_DIRECT) == 0) {
1420             s->page_cache_inconsistent = errno;
1421         }
1422         return -errno;
1423     }
1424     return 0;
1425 }
1426 
1427 #ifdef CONFIG_PREADV
1428 
1429 static bool preadv_present = true;
1430 
1431 static ssize_t
1432 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1433 {
1434     return preadv(fd, iov, nr_iov, offset);
1435 }
1436 
1437 static ssize_t
1438 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1439 {
1440     return pwritev(fd, iov, nr_iov, offset);
1441 }
1442 
1443 #else
1444 
1445 static bool preadv_present = false;
1446 
1447 static ssize_t
1448 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1449 {
1450     return -ENOSYS;
1451 }
1452 
1453 static ssize_t
1454 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1455 {
1456     return -ENOSYS;
1457 }
1458 
1459 #endif
1460 
1461 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1462 {
1463     ssize_t len;
1464 
1465     do {
1466         if (aiocb->aio_type & QEMU_AIO_WRITE) {
1467             len = qemu_pwritev(aiocb->aio_fildes,
1468                                aiocb->io.iov,
1469                                aiocb->io.niov,
1470                                aiocb->aio_offset);
1471         } else {
1472             len = qemu_preadv(aiocb->aio_fildes,
1473                               aiocb->io.iov,
1474                               aiocb->io.niov,
1475                               aiocb->aio_offset);
        }
1476     } while (len == -1 && errno == EINTR);
1477 
1478     if (len == -1) {
1479         return -errno;
1480     }
1481     return len;
1482 }
1483 
1484 /*
1485  * Reads/writes the data to/from a given linear buffer.
1486  *
1487  * Returns the number of bytes handled or -errno in case of an error. Short
1488  * reads are only returned if the end of the file is reached.
1489  */
1490 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1491 {
1492     ssize_t offset = 0;
1493     ssize_t len;
1494 
1495     while (offset < aiocb->aio_nbytes) {
1496         if (aiocb->aio_type & QEMU_AIO_WRITE) {
1497             len = pwrite(aiocb->aio_fildes,
1498                          (const char *)buf + offset,
1499                          aiocb->aio_nbytes - offset,
1500                          aiocb->aio_offset + offset);
1501         } else {
1502             len = pread(aiocb->aio_fildes,
1503                         buf + offset,
1504                         aiocb->aio_nbytes - offset,
1505                         aiocb->aio_offset + offset);
1506         }
1507         if (len == -1 && errno == EINTR) {
1508             continue;
1509         } else if (len == -1 && errno == EINVAL &&
1510                    (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1511                    !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1512                    offset > 0) {
1513             /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1514              * after a short read.  Assume that O_DIRECT short reads only occur
1515              * at EOF.  Therefore this is a short read, not an I/O error.
1516              */
1517             break;
1518         } else if (len == -1) {
1519             offset = -errno;
1520             break;
1521         } else if (len == 0) {
1522             break;
1523         }
1524         offset += len;
1525     }
1526 
1527     return offset;
1528 }
1529 
1530 static int handle_aiocb_rw(void *opaque)
1531 {
1532     RawPosixAIOData *aiocb = opaque;
1533     ssize_t nbytes;
1534     char *buf;
1535 
1536     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1537         /*
1538          * If there is just a single buffer and it is properly aligned,
1539          * we can just use plain pread/pwrite without any problems.
1540          */
1541         if (aiocb->io.niov == 1) {
1542             nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1543             goto out;
1544         }
1545         /*
1546          * We have more than one iovec, and all are properly aligned.
1547          *
1548          * Try preadv/pwritev first and fall back to linearizing the
1549          * buffer if it's not supported.
1550          */
1551         if (preadv_present) {
1552             nbytes = handle_aiocb_rw_vector(aiocb);
1553             if (nbytes == aiocb->aio_nbytes ||
1554                 (nbytes < 0 && nbytes != -ENOSYS)) {
1555                 goto out;
1556             }
1557             preadv_present = false;
1558         }
1559 
1560         /*
1561          * XXX(hch): short read/write.  No easy way to handle the remainder
1562          * using these interfaces.  For now retry using plain
1563          * pread/pwrite?
1564          */
1565     }
1566 
1567     /*
1568      * OK, we have to do it the hard way: copy all segments into
1569      * a single aligned buffer.
1570      */
1571     buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1572     if (buf == NULL) {
1573         nbytes = -ENOMEM;
1574         goto out;
1575     }
1576 
1577     if (aiocb->aio_type & QEMU_AIO_WRITE) {
1578         char *p = buf;
1579         int i;
1580 
1581         for (i = 0; i < aiocb->io.niov; ++i) {
1582             memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1583             p += aiocb->io.iov[i].iov_len;
1584         }
1585         assert(p - buf == aiocb->aio_nbytes);
1586     }
1587 
1588     nbytes = handle_aiocb_rw_linear(aiocb, buf);
1589     if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1590         char *p = buf;
1591         size_t count = aiocb->aio_nbytes, copy;
1592         int i;
1593 
1594         for (i = 0; i < aiocb->io.niov && count; ++i) {
1595             copy = count;
1596             if (copy > aiocb->io.iov[i].iov_len) {
1597                 copy = aiocb->io.iov[i].iov_len;
1598             }
1599             memcpy(aiocb->io.iov[i].iov_base, p, copy);
1600             assert(count >= copy);
1601             p     += copy;
1602             count -= copy;
1603         }
1604         assert(count == 0);
1605     }
1606     qemu_vfree(buf);
1607 
1608 out:
1609     if (nbytes == aiocb->aio_nbytes) {
1610         return 0;
1611     } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1612         if (aiocb->aio_type & QEMU_AIO_WRITE) {
1613             return -EINVAL;
1614         } else {
1615             iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1616                       0, aiocb->aio_nbytes - nbytes);
1617             return 0;
1618         }
1619     } else {
1620         assert(nbytes < 0);
1621         return nbytes;
1622     }
1623 }
1624 
1625 #if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
1626 static int translate_err(int err)
1627 {
1628     if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
1629         err == -ENOTTY) {
1630         err = -ENOTSUP;
1631     }
1632     return err;
1633 }
1634 #endif
1635 
1636 #ifdef CONFIG_FALLOCATE
1637 static int do_fallocate(int fd, int mode, off_t offset, off_t len)
1638 {
1639     do {
1640         if (fallocate(fd, mode, offset, len) == 0) {
1641             return 0;
1642         }
1643     } while (errno == EINTR);
1644     return translate_err(-errno);
1645 }
1646 #endif
1647 
1648 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1649 {
1650     int ret = -ENOTSUP;
1651     BDRVRawState *s = aiocb->bs->opaque;
1652 
1653     if (!s->has_write_zeroes) {
1654         return -ENOTSUP;
1655     }
1656 
1657 #ifdef BLKZEROOUT
1658     /* The BLKZEROOUT implementation in the kernel doesn't set
1659      * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1660      * fallbacks. */
1661     if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1662         do {
1663             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1664             if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1665                 return 0;
1666             }
1667         } while (errno == EINTR);
1668 
1669         ret = translate_err(-errno);
1670         if (ret == -ENOTSUP) {
1671             s->has_write_zeroes = false;
1672         }
1673     }
1674 #endif
1675 
1676     return ret;
1677 }
1678 
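/*
 * Zeroing strategy for non-block-device requests (handled below): try
 * FALLOC_FL_ZERO_RANGE first, then FALLOC_FL_PUNCH_HOLE followed by a plain
 * fallocate() to re-reserve the punched range as zeroes, and finally a plain
 * fallocate() when the range merely extends the file. If none of these is
 * supported, -ENOTSUP lets the generic block layer write explicit zeroes.
 */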
1679 static int handle_aiocb_write_zeroes(void *opaque)
1680 {
1681     RawPosixAIOData *aiocb = opaque;
1682 #ifdef CONFIG_FALLOCATE
1683     BDRVRawState *s = aiocb->bs->opaque;
1684     int64_t len;
1685 #endif
1686 
1687     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1688         return handle_aiocb_write_zeroes_block(aiocb);
1689     }
1690 
1691 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
1692     if (s->has_write_zeroes) {
1693         int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1694                                aiocb->aio_offset, aiocb->aio_nbytes);
1695         if (ret == -ENOTSUP) {
1696             s->has_write_zeroes = false;
1697         } else if (ret == 0 || ret != -EINVAL) {
1698             return ret;
1699         }
1700         /*
1701          * Note: Some file systems do not like unaligned byte ranges, and
1702          * return EINVAL in such a case, though they should not do it according
1703          * to the man-page of fallocate(). Thus we simply ignore this return
1704          * value and try the other fallbacks instead.
1705          */
1706     }
1707 #endif
1708 
1709 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1710     if (s->has_discard && s->has_fallocate) {
1711         int ret = do_fallocate(s->fd,
1712                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1713                                aiocb->aio_offset, aiocb->aio_nbytes);
1714         if (ret == 0) {
1715             ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1716             if (ret == 0 || ret != -ENOTSUP) {
1717                 return ret;
1718             }
1719             s->has_fallocate = false;
1720         } else if (ret == -EINVAL) {
1721             /*
1722              * Some file systems like older versions of GPFS do not like un-
1723              * aligned byte ranges, and return EINVAL in such a case, though
1724              * they should not do it according to the man-page of fallocate().
1725              * Warn about the bad filesystem and try the final fallback instead.
1726              */
1727             warn_report_once("Your file system is misbehaving: "
1728                              "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
1729                              "Please report this bug to your file system "
1730                              "vendor.");
1731         } else if (ret != -ENOTSUP) {
1732             return ret;
1733         } else {
1734             s->has_discard = false;
1735         }
1736     }
1737 #endif
1738 
1739 #ifdef CONFIG_FALLOCATE
1740     /* Last resort: we are trying to extend the file with zeroed data. This
1741      * can be done via fallocate(fd, 0) */
1742     len = bdrv_getlength(aiocb->bs);
1743     if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1744         int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1745         if (ret == 0 || ret != -ENOTSUP) {
1746             return ret;
1747         }
1748         s->has_fallocate = false;
1749     }
1750 #endif
1751 
1752     return -ENOTSUP;
1753 }
1754 
1755 static int handle_aiocb_write_zeroes_unmap(void *opaque)
1756 {
1757     RawPosixAIOData *aiocb = opaque;
1758     BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1759 
1760     /* First try to write zeros and unmap at the same time */
1761 
1762 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1763     int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1764                            aiocb->aio_offset, aiocb->aio_nbytes);
1765     switch (ret) {
1766     case -ENOTSUP:
1767     case -EINVAL:
1768     case -EBUSY:
1769         break;
1770     default:
1771         return ret;
1772     }
1773 #endif
1774 
1775     /* If we couldn't unmap while guaranteeing that the area reads as all-zero
1776      * afterwards, just write zeroes without unmapping */
1777     return handle_aiocb_write_zeroes(aiocb);
1778 }
1779 
1780 #ifndef HAVE_COPY_FILE_RANGE
1781 static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
1782                              off_t *out_off, size_t len, unsigned int flags)
1783 {
1784 #ifdef __NR_copy_file_range
1785     return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
1786                    out_off, len, flags);
1787 #else
1788     errno = ENOSYS;
1789     return -1;
1790 #endif
1791 }
1792 #endif
1793 
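/*
 * copy_file_range() may copy fewer bytes than requested and advances the
 * offsets it is given, so keep retrying until the whole request is done.
 * A return value of 0 (no progress, e.g. when reading beyond EOF) is
 * reported as -ENOSPC so that the caller falls back to buffered copying.
 */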
1794 static int handle_aiocb_copy_range(void *opaque)
1795 {
1796     RawPosixAIOData *aiocb = opaque;
1797     uint64_t bytes = aiocb->aio_nbytes;
1798     off_t in_off = aiocb->aio_offset;
1799     off_t out_off = aiocb->copy_range.aio_offset2;
1800 
1801     while (bytes) {
1802         ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1803                                       aiocb->copy_range.aio_fd2, &out_off,
1804                                       bytes, 0);
1805         trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1806                                    aiocb->copy_range.aio_fd2, out_off, bytes,
1807                                    0, ret);
1808         if (ret == 0) {
1809             /* No progress (e.g. when beyond EOF), let the caller fall back to
1810              * buffer I/O. */
1811             return -ENOSPC;
1812         }
1813         if (ret < 0) {
1814             switch (errno) {
1815             case ENOSYS:
1816                 return -ENOTSUP;
1817             case EINTR:
1818                 continue;
1819             default:
1820                 return -errno;
1821             }
1822         }
1823         bytes -= ret;
1824     }
1825     return 0;
1826 }
1827 
1828 static int handle_aiocb_discard(void *opaque)
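/*
 * Discard a byte range.  Block devices use the BLKDISCARD ioctl; regular
 * files use fallocate(FALLOC_FL_PUNCH_HOLE) on Linux or the F_PUNCHHOLE
 * fcntl on macOS.  -ENOTSUP disables further discard attempts for this
 * BDS via s->has_discard.
 */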
1829 {
1830     RawPosixAIOData *aiocb = opaque;
1831     int ret = -ENOTSUP;
1832     BDRVRawState *s = aiocb->bs->opaque;
1833 
1834     if (!s->has_discard) {
1835         return -ENOTSUP;
1836     }
1837 
1838     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1839 #ifdef BLKDISCARD
1840         do {
1841             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1842             if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1843                 return 0;
1844             }
1845         } while (errno == EINTR);
1846 
1847         ret = translate_err(-errno);
1848 #endif
1849     } else {
1850 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1851         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1852                            aiocb->aio_offset, aiocb->aio_nbytes);
1853         ret = translate_err(ret);
1854 #elif defined(__APPLE__) && (__MACH__)
1855         fpunchhole_t fpunchhole;
1856         fpunchhole.fp_flags = 0;
1857         fpunchhole.reserved = 0;
1858         fpunchhole.fp_offset = aiocb->aio_offset;
1859         fpunchhole.fp_length = aiocb->aio_nbytes;
1860         if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
1861             ret = errno == ENODEV ? -ENOTSUP : -errno;
1862         } else {
1863             ret = 0;
1864         }
1865 #endif
1866     }
1867 
1868     if (ret == -ENOTSUP) {
1869         s->has_discard = false;
1870     }
1871     return ret;
1872 }
1873 
1874 /*
1875  * Help alignment probing by allocating the first block.
1876  *
1877  * When reading with direct I/O from unallocated area on Gluster backed by XFS,
1878  * reading succeeds regardless of request length. In this case we fallback to
1879  * safe alignment which is not optimal. Allocating the first block avoids this
1880  * fallback.
1881  *
1882  * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
1883  * request alignment, so we use safe values.
1884  *
1885  * Returns: 0 on success, -errno on failure. Since this is an optimization,
1886  * caller may ignore failures.
1887  */
1888 static int allocate_first_block(int fd, size_t max_size)
1889 {
1890     size_t write_size = (max_size < MAX_BLOCKSIZE)
1891         ? BDRV_SECTOR_SIZE
1892         : MAX_BLOCKSIZE;
1893     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
1894     void *buf;
1895     ssize_t n;
1896     int ret;
1897 
1898     buf = qemu_memalign(max_align, write_size);
1899     memset(buf, 0, write_size);
1900 
1901     do {
1902         n = pwrite(fd, buf, write_size, 0);
1903     } while (n == -1 && errno == EINTR);
1904 
1905     ret = (n == -1) ? -errno : 0;
1906 
1907     qemu_vfree(buf);
1908     return ret;
1909 }
1910 
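/*
 * Truncate or grow the file, optionally preallocating the new area.  Runs
 * in a worker thread; if preallocation fails, the file is truncated back
 * to its original length before the error is returned.
 */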
1911 static int handle_aiocb_truncate(void *opaque)
1912 {
1913     RawPosixAIOData *aiocb = opaque;
1914     int result = 0;
1915     int64_t current_length = 0;
1916     char *buf = NULL;
1917     struct stat st;
1918     int fd = aiocb->aio_fildes;
1919     int64_t offset = aiocb->aio_offset;
1920     PreallocMode prealloc = aiocb->truncate.prealloc;
1921     Error **errp = aiocb->truncate.errp;
1922 
1923     if (fstat(fd, &st) < 0) {
1924         result = -errno;
1925         error_setg_errno(errp, -result, "Could not stat file");
1926         return result;
1927     }
1928 
1929     current_length = st.st_size;
1930     if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
1931         error_setg(errp, "Cannot use preallocation for shrinking files");
1932         return -ENOTSUP;
1933     }
1934 
1935     switch (prealloc) {
1936 #ifdef CONFIG_POSIX_FALLOCATE
1937     case PREALLOC_MODE_FALLOC:
1938         /*
1939          * Truncating before posix_fallocate() makes it about twice as slow on
1940          * file systems without fallocate() support, because the fallback then
1941          * checks each block before allocating it, so don't truncate here.
1942          */
1943         if (offset != current_length) {
1944             result = -posix_fallocate(fd, current_length,
1945                                       offset - current_length);
1946             if (result != 0) {
1947                 /* posix_fallocate() doesn't set errno. */
1948                 error_setg_errno(errp, -result,
1949                                  "Could not preallocate new data");
1950             } else if (current_length == 0) {
1951                 /*
1952                  * posix_fallocate() uses fallocate() if the filesystem
1953                  * supports it, or falls back to manually writing zeroes. If
1954                  * fallocate() was used, unaligned reads from the fallocated
1955                  * area in raw_probe_alignment() will succeed, hence we need to
1956                  * allocate the first block.
1957                  *
1958                  * Optimize future alignment probing; ignore failures.
1959                  */
1960                 allocate_first_block(fd, offset);
1961             }
1962         } else {
1963             result = 0;
1964         }
1965         goto out;
1966 #endif
1967     case PREALLOC_MODE_FULL:
1968     {
1969         int64_t num = 0, left = offset - current_length;
1970         off_t seek_result;
1971 
1972         /*
1973          * Knowing the final size from the beginning could allow the file
1974          * system driver to do fewer allocations and possibly avoid
1975          * fragmentation of the file.
1976          */
1977         if (ftruncate(fd, offset) != 0) {
1978             result = -errno;
1979             error_setg_errno(errp, -result, "Could not resize file");
1980             goto out;
1981         }
1982 
1983         buf = g_malloc0(65536);
1984 
1985         seek_result = lseek(fd, current_length, SEEK_SET);
1986         if (seek_result < 0) {
1987             result = -errno;
1988             error_setg_errno(errp, -result,
1989                              "Failed to seek to the old end of file");
1990             goto out;
1991         }
1992 
1993         while (left > 0) {
1994             num = MIN(left, 65536);
1995             result = write(fd, buf, num);
1996             if (result < 0) {
1997                 if (errno == EINTR) {
1998                     continue;
1999                 }
2000                 result = -errno;
2001                 error_setg_errno(errp, -result,
2002                                  "Could not write zeros for preallocation");
2003                 goto out;
2004             }
2005             left -= result;
2006         }
2007         if (result >= 0) {
2008             result = fsync(fd);
2009             if (result < 0) {
2010                 result = -errno;
2011                 error_setg_errno(errp, -result,
2012                                  "Could not flush file to disk");
2013                 goto out;
2014             }
2015         }
2016         goto out;
2017     }
2018     case PREALLOC_MODE_OFF:
2019         if (ftruncate(fd, offset) != 0) {
2020             result = -errno;
2021             error_setg_errno(errp, -result, "Could not resize file");
2022         } else if (current_length == 0 && offset > current_length) {
2023             /* Optimize future alignment probing; ignore failures. */
2024             allocate_first_block(fd, offset);
2025         }
2026         return result;
2027     default:
2028         result = -ENOTSUP;
2029         error_setg(errp, "Unsupported preallocation mode: %s",
2030                    PreallocMode_str(prealloc));
2031         return result;
2032     }
2033 
2034 out:
2035     if (result < 0) {
2036         if (ftruncate(fd, current_length) < 0) {
2037             error_report("Failed to restore old file length: %s",
2038                          strerror(errno));
2039         }
2040     }
2041 
2042     g_free(buf);
2043     return result;
2044 }
2045 
2046 static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
2047                                                ThreadPoolFunc func, void *arg)
2048 {
2049     /* @bs may be NULL; bdrv_get_aio_context() then returns the main context */
2050     ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
2051     return thread_pool_submit_co(pool, func, arg);
2052 }
2053 
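/*
 * Common read/write path.  Misaligned requests on files that need alignment
 * are flagged with QEMU_AIO_MISALIGNED and emulated in the thread pool;
 * otherwise the request is submitted to io_uring or Linux AIO when enabled,
 * and to the thread pool as a last resort.
 */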
2054 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
2055                                    uint64_t bytes, QEMUIOVector *qiov, int type)
2056 {
2057     BDRVRawState *s = bs->opaque;
2058     RawPosixAIOData acb;
2059 
2060     if (fd_open(bs) < 0)
2061         return -EIO;
2062 
2063     /*
2064      * When using O_DIRECT, the request must be aligned to be able to use
2065      * either the libaio or io_uring interface. If not, fall back to the
2066      * regular thread pool read/write code, which emulates misaligned
2067      * requests for us when we set QEMU_AIO_MISALIGNED.
2068      */
2069     if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
2070         type |= QEMU_AIO_MISALIGNED;
2071 #ifdef CONFIG_LINUX_IO_URING
2072     } else if (s->use_linux_io_uring) {
2073         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2074         assert(qiov->size == bytes);
2075         return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
2076 #endif
2077 #ifdef CONFIG_LINUX_AIO
2078     } else if (s->use_linux_aio) {
2079         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2080         assert(qiov->size == bytes);
2081         return laio_co_submit(bs, aio, s->fd, offset, qiov, type,
2082                               s->aio_max_batch);
2083 #endif
2084     }
2085 
2086     acb = (RawPosixAIOData) {
2087         .bs             = bs,
2088         .aio_fildes     = s->fd,
2089         .aio_type       = type,
2090         .aio_offset     = offset,
2091         .aio_nbytes     = bytes,
2092         .io             = {
2093             .iov            = qiov->iov,
2094             .niov           = qiov->niov,
2095         },
2096     };
2097 
2098     assert(qiov->size == bytes);
2099     return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
2100 }
2101 
2102 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
2103                                       int64_t bytes, QEMUIOVector *qiov,
2104                                       BdrvRequestFlags flags)
2105 {
2106     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
2107 }
2108 
2109 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
2110                                        int64_t bytes, QEMUIOVector *qiov,
2111                                        BdrvRequestFlags flags)
2112 {
2113     assert(flags == 0);
2114     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
2115 }
2116 
2117 static void raw_aio_plug(BlockDriverState *bs)
2118 {
2119     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2120 #ifdef CONFIG_LINUX_AIO
2121     if (s->use_linux_aio) {
2122         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2123         laio_io_plug(bs, aio);
2124     }
2125 #endif
2126 #ifdef CONFIG_LINUX_IO_URING
2127     if (s->use_linux_io_uring) {
2128         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2129         luring_io_plug(bs, aio);
2130     }
2131 #endif
2132 }
2133 
2134 static void raw_aio_unplug(BlockDriverState *bs)
2135 {
2136     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2137 #ifdef CONFIG_LINUX_AIO
2138     if (s->use_linux_aio) {
2139         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2140         laio_io_unplug(bs, aio, s->aio_max_batch);
2141     }
2142 #endif
2143 #ifdef CONFIG_LINUX_IO_URING
2144     if (s->use_linux_io_uring) {
2145         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2146         luring_io_unplug(bs, aio);
2147     }
2148 #endif
2149 }
2150 
2151 static int raw_co_flush_to_disk(BlockDriverState *bs)
2152 {
2153     BDRVRawState *s = bs->opaque;
2154     RawPosixAIOData acb;
2155     int ret;
2156 
2157     ret = fd_open(bs);
2158     if (ret < 0) {
2159         return ret;
2160     }
2161 
2162     acb = (RawPosixAIOData) {
2163         .bs             = bs,
2164         .aio_fildes     = s->fd,
2165         .aio_type       = QEMU_AIO_FLUSH,
2166     };
2167 
2168 #ifdef CONFIG_LINUX_IO_URING
2169     if (s->use_linux_io_uring) {
2170         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2171         return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
2172     }
2173 #endif
2174     return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
2175 }
2176 
2177 static void raw_aio_attach_aio_context(BlockDriverState *bs,
2178                                        AioContext *new_context)
2179 {
2180     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2181 #ifdef CONFIG_LINUX_AIO
2182     if (s->use_linux_aio) {
2183         Error *local_err = NULL;
2184         if (!aio_setup_linux_aio(new_context, &local_err)) {
2185             error_reportf_err(local_err, "Unable to use native AIO, "
2186                                          "falling back to thread pool: ");
2187             s->use_linux_aio = false;
2188         }
2189     }
2190 #endif
2191 #ifdef CONFIG_LINUX_IO_URING
2192     if (s->use_linux_io_uring) {
2193         Error *local_err = NULL;
2194         if (!aio_setup_linux_io_uring(new_context, &local_err)) {
2195             error_reportf_err(local_err, "Unable to use linux io_uring, "
2196                                          "falling back to thread pool: ");
2197             s->use_linux_io_uring = false;
2198         }
2199     }
2200 #endif
2201 }
2202 
2203 static void raw_close(BlockDriverState *bs)
2204 {
2205     BDRVRawState *s = bs->opaque;
2206 
2207     if (s->fd >= 0) {
2208         qemu_close(s->fd);
2209         s->fd = -1;
2210     }
2211 }
2212 
2213 /**
2214  * Truncates the given regular file @fd to @offset and, when growing, fills the
2215  * new space according to @prealloc.
2216  *
2217  * Returns: 0 on success, -errno on failure.
2218  */
2219 static int coroutine_fn
2220 raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2221                      PreallocMode prealloc, Error **errp)
2222 {
2223     RawPosixAIOData acb;
2224 
2225     acb = (RawPosixAIOData) {
2226         .bs             = bs,
2227         .aio_fildes     = fd,
2228         .aio_type       = QEMU_AIO_TRUNCATE,
2229         .aio_offset     = offset,
2230         .truncate       = {
2231             .prealloc       = prealloc,
2232             .errp           = errp,
2233         },
2234     };
2235 
2236     return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
2237 }
2238 
2239 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2240                                         bool exact, PreallocMode prealloc,
2241                                         BdrvRequestFlags flags, Error **errp)
2242 {
2243     BDRVRawState *s = bs->opaque;
2244     struct stat st;
2245     int ret;
2246 
2247     if (fstat(s->fd, &st)) {
2248         ret = -errno;
2249         error_setg_errno(errp, -ret, "Failed to fstat() the file");
2250         return ret;
2251     }
2252 
2253     if (S_ISREG(st.st_mode)) {
2254         /* Always resizes to the exact @offset */
2255         return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2256     }
2257 
2258     if (prealloc != PREALLOC_MODE_OFF) {
2259         error_setg(errp, "Preallocation mode '%s' unsupported for this "
2260                    "non-regular file", PreallocMode_str(prealloc));
2261         return -ENOTSUP;
2262     }
2263 
2264     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2265         int64_t cur_length = raw_getlength(bs);
2266 
2267         if (offset != cur_length && exact) {
2268             error_setg(errp, "Cannot resize device files");
2269             return -ENOTSUP;
2270         } else if (offset > cur_length) {
2271             error_setg(errp, "Cannot grow device files");
2272             return -EINVAL;
2273         }
2274     } else {
2275         error_setg(errp, "Resizing this file is not supported");
2276         return -ENOTSUP;
2277     }
2278 
2279     return 0;
2280 }
2281 
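/*
 * raw_getlength(): platform-specific size probing.  Character and block
 * devices need ioctls (disklabel, wedge info, media size, ...) because
 * fstat() does not report a useful size for them; regular files use
 * st_size or lseek(SEEK_END).
 */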
2282 #ifdef __OpenBSD__
2283 static int64_t raw_getlength(BlockDriverState *bs)
2284 {
2285     BDRVRawState *s = bs->opaque;
2286     int fd = s->fd;
2287     struct stat st;
2288 
2289     if (fstat(fd, &st))
2290         return -errno;
2291     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2292         struct disklabel dl;
2293 
2294         if (ioctl(fd, DIOCGDINFO, &dl))
2295             return -errno;
2296         return (uint64_t)dl.d_secsize *
2297             dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2298     } else
2299         return st.st_size;
2300 }
2301 #elif defined(__NetBSD__)
2302 static int64_t raw_getlength(BlockDriverState *bs)
2303 {
2304     BDRVRawState *s = bs->opaque;
2305     int fd = s->fd;
2306     struct stat st;
2307 
2308     if (fstat(fd, &st))
2309         return -errno;
2310     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2311         struct dkwedge_info dkw;
2312 
2313         if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2314             return dkw.dkw_size * 512;
2315         } else {
2316             struct disklabel dl;
2317 
2318             if (ioctl(fd, DIOCGDINFO, &dl))
2319                 return -errno;
2320             return (uint64_t)dl.d_secsize *
2321                 dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2322         }
2323     } else
2324         return st.st_size;
2325 }
2326 #elif defined(__sun__)
2327 static int64_t raw_getlength(BlockDriverState *bs)
2328 {
2329     BDRVRawState *s = bs->opaque;
2330     struct dk_minfo minfo;
2331     int ret;
2332     int64_t size;
2333 
2334     ret = fd_open(bs);
2335     if (ret < 0) {
2336         return ret;
2337     }
2338 
2339     /*
2340      * Use the DKIOCGMEDIAINFO ioctl to read the size.
2341      */
2342     ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2343     if (ret != -1) {
2344         return minfo.dki_lbsize * minfo.dki_capacity;
2345     }
2346 
2347     /*
2348      * There are reports that lseek on some devices fails, but
2349      * IRC discussion said that contingency on contingency was overkill.
2350      */
2351     size = lseek(s->fd, 0, SEEK_END);
2352     if (size < 0) {
2353         return -errno;
2354     }
2355     return size;
2356 }
2357 #elif defined(CONFIG_BSD)
2358 static int64_t raw_getlength(BlockDriverState *bs)
2359 {
2360     BDRVRawState *s = bs->opaque;
2361     int fd = s->fd;
2362     int64_t size;
2363     struct stat sb;
2364 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2365     int reopened = 0;
2366 #endif
2367     int ret;
2368 
2369     ret = fd_open(bs);
2370     if (ret < 0)
2371         return ret;
2372 
2373 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2374 again:
2375 #endif
2376     if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2377         size = 0;
2378 #ifdef DIOCGMEDIASIZE
2379         if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
2380             size = 0;
2381         }
2382 #endif
2383 #ifdef DIOCGPART
2384         if (size == 0) {
2385             struct partinfo pi;
2386             if (ioctl(fd, DIOCGPART, &pi) == 0) {
2387                 size = pi.media_size;
2388             }
2389         }
2390 #endif
2391 #if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
2392         if (size == 0) {
2393             uint64_t sectors = 0;
2394             uint32_t sector_size = 0;
2395 
2396             if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2397                && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2398                 size = sectors * sector_size;
2399             }
2400         }
2401 #endif
2402         if (size == 0) {
2403             size = lseek(fd, 0LL, SEEK_END);
2404         }
2405         if (size < 0) {
2406             return -errno;
2407         }
2408 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2409         switch(s->type) {
2410         case FTYPE_CD:
2411             /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2412             if (size == 2048LL * (unsigned)-1)
2413                 size = 0;
2414             /* XXX no disc?  maybe we need to reopen... */
2415             if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2416                 reopened = 1;
2417                 goto again;
2418             }
2419         }
2420 #endif
2421     } else {
2422         size = lseek(fd, 0, SEEK_END);
2423         if (size < 0) {
2424             return -errno;
2425         }
2426     }
2427     return size;
2428 }
2429 #else
2430 static int64_t raw_getlength(BlockDriverState *bs)
2431 {
2432     BDRVRawState *s = bs->opaque;
2433     int ret;
2434     int64_t size;
2435 
2436     ret = fd_open(bs);
2437     if (ret < 0) {
2438         return ret;
2439     }
2440 
2441     size = lseek(s->fd, 0, SEEK_END);
2442     if (size < 0) {
2443         return -errno;
2444     }
2445     return size;
2446 }
2447 #endif
2448 
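/* Note: st_blocks is always in 512-byte units, independent of the file
 * system block size. */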
2449 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2450 {
2451     struct stat st;
2452     BDRVRawState *s = bs->opaque;
2453 
2454     if (fstat(s->fd, &st) < 0) {
2455         return -errno;
2456     }
2457     return (int64_t)st.st_blocks * 512;
2458 }
2459 
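/*
 * Implements image creation for the 'file' driver (blockdev-create): create
 * and lock the file, check for conflicting locks, truncate it to zero, set
 * the NOCOW flag and extent size hint if requested, then resize and
 * preallocate it to the final size.
 *
 * Illustrative QMP usage (sketch, not part of this file):
 *   { "execute": "blockdev-create",
 *     "arguments": { "job-id": "create0",
 *                    "options": { "driver": "file",
 *                                 "filename": "/tmp/disk.img",
 *                                 "size": 134217728 } } }
 */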
2460 static int coroutine_fn
2461 raw_co_create(BlockdevCreateOptions *options, Error **errp)
2462 {
2463     BlockdevCreateOptionsFile *file_opts;
2464     Error *local_err = NULL;
2465     int fd;
2466     uint64_t perm, shared;
2467     int result = 0;
2468 
2469     /* Validate options and set default values */
2470     assert(options->driver == BLOCKDEV_DRIVER_FILE);
2471     file_opts = &options->u.file;
2472 
2473     if (!file_opts->has_nocow) {
2474         file_opts->nocow = false;
2475     }
2476     if (!file_opts->has_preallocation) {
2477         file_opts->preallocation = PREALLOC_MODE_OFF;
2478     }
2479     if (!file_opts->has_extent_size_hint) {
2480         file_opts->extent_size_hint = 1 * MiB;
2481     }
2482     if (file_opts->extent_size_hint > UINT32_MAX) {
2483         result = -EINVAL;
2484         error_setg(errp, "Extent size hint is too large");
2485         goto out;
2486     }
2487 
2488     /* Create file */
2489     fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
2490     if (fd < 0) {
2491         result = -errno;
2492         goto out;
2493     }
2494 
2495     /* Take permissions: We want to discard everything, so we need
2496      * BLK_PERM_WRITE; and truncation to the desired size requires
2497      * BLK_PERM_RESIZE.
2498      * On the other hand, we cannot share the RESIZE permission
2499      * because we promise that after this function, the file has the
2500      * size given in the options.  If someone else were to resize it
2501      * concurrently, we could not guarantee that.
2502      * Note that after this function, we can no longer guarantee that
2503      * the file is not touched by a third party, so it may be resized
2504      * then. */
2505     perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2506     shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2507 
2508     /* Step one: Take locks */
2509     result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2510     if (result < 0) {
2511         goto out_close;
2512     }
2513 
2514     /* Step two: Check that nobody else has taken conflicting locks */
2515     result = raw_check_lock_bytes(fd, perm, shared, errp);
2516     if (result < 0) {
2517         error_append_hint(errp,
2518                           "Is another process using the image [%s]?\n",
2519                           file_opts->filename);
2520         goto out_unlock;
2521     }
2522 
2523     /* Clear the file by truncating it to 0 */
2524     result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2525     if (result < 0) {
2526         goto out_unlock;
2527     }
2528 
2529     if (file_opts->nocow) {
2530 #ifdef __linux__
2531         /* Set the NOCOW flag to avoid performance issues on file systems such
2532          * as btrfs. This is an optimisation: the FS_IOC_SETFLAGS ioctl return
2533          * value is ignored, since a failure of this operation must not block
2534          * the remaining work.
2535          */
2536         int attr;
2537         if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2538             attr |= FS_NOCOW_FL;
2539             ioctl(fd, FS_IOC_SETFLAGS, &attr);
2540         }
2541 #endif
2542     }
2543 #ifdef FS_IOC_FSSETXATTR
2544     /*
2545      * Try to set the extent size hint. Failure is not fatal, and a warning is
2546      * only printed if the option was explicitly specified.
2547      */
2548     {
2549         struct fsxattr attr;
2550         result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
2551         if (result == 0) {
2552             attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
2553             attr.fsx_extsize = file_opts->extent_size_hint;
2554             result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
2555         }
2556         if (result < 0 && file_opts->has_extent_size_hint &&
2557             file_opts->extent_size_hint)
2558         {
2559             warn_report("Failed to set extent size hint: %s",
2560                         strerror(errno));
2561         }
2562     }
2563 #endif
2564 
2565     /* Resize and potentially preallocate the file to the desired
2566      * final size */
2567     result = raw_regular_truncate(NULL, fd, file_opts->size,
2568                                   file_opts->preallocation, errp);
2569     if (result < 0) {
2570         goto out_unlock;
2571     }
2572 
2573 out_unlock:
2574     raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2575     if (local_err) {
2576         /* The above call should not fail, and if it does, that does
2577          * not mean the whole creation operation has failed.  So
2578          * report it to the user for their convenience, but do not report
2579          * it to the caller. */
2580         warn_report_err(local_err);
2581     }
2582 
2583 out_close:
2584     if (qemu_close(fd) != 0 && result == 0) {
2585         result = -errno;
2586         error_setg_errno(errp, -result, "Could not close the new file");
2587     }
2588 out:
2589     return result;
2590 }
2591 
2592 static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
2593                                            const char *filename,
2594                                            QemuOpts *opts,
2595                                            Error **errp)
2596 {
2597     BlockdevCreateOptions options;
2598     int64_t total_size = 0;
2599     int64_t extent_size_hint = 0;
2600     bool has_extent_size_hint = false;
2601     bool nocow = false;
2602     PreallocMode prealloc;
2603     char *buf = NULL;
2604     Error *local_err = NULL;
2605 
2606     /* Skip file: protocol prefix */
2607     strstart(filename, "file:", &filename);
2608 
2609     /* Read out options */
2610     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2611                           BDRV_SECTOR_SIZE);
2612     if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
2613         has_extent_size_hint = true;
2614         extent_size_hint =
2615             qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
2616     }
2617     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2618     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2619     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2620                                PREALLOC_MODE_OFF, &local_err);
2621     g_free(buf);
2622     if (local_err) {
2623         error_propagate(errp, local_err);
2624         return -EINVAL;
2625     }
2626 
2627     options = (BlockdevCreateOptions) {
2628         .driver     = BLOCKDEV_DRIVER_FILE,
2629         .u.file     = {
2630             .filename           = (char *) filename,
2631             .size               = total_size,
2632             .has_preallocation  = true,
2633             .preallocation      = prealloc,
2634             .has_nocow          = true,
2635             .nocow              = nocow,
2636             .has_extent_size_hint = has_extent_size_hint,
2637             .extent_size_hint   = extent_size_hint,
2638         },
2639     };
2640     return raw_co_create(&options, errp);
2641 }
2642 
2643 static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
2644                                            Error **errp)
2645 {
2646     struct stat st;
2647     int ret;
2648 
2649     if (stat(bs->filename, &st) != 0 || !S_ISREG(st.st_mode)) {
2650         error_setg_errno(errp, ENOENT, "%s is not a regular file",
2651                          bs->filename);
2652         return -ENOENT;
2653     }
2654 
2655     ret = unlink(bs->filename);
2656     if (ret < 0) {
2657         ret = -errno;
2658         error_setg_errno(errp, -ret, "Error when deleting file %s",
2659                          bs->filename);
2660     }
2661 
2662     return ret;
2663 }
2664 
2665 /*
2666  * Find allocation range in @bs around offset @start.
2667  * May change underlying file descriptor's file offset.
2668  * If @start is not in a hole, store @start in @data, and the
2669  * beginning of the next hole in @hole, and return 0.
2670  * If @start is in a non-trailing hole, store @start in @hole and the
2671  * beginning of the next non-hole in @data, and return 0.
2672  * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2673  * If we can't find out, return a negative errno other than -ENXIO.
2674  */
2675 static int find_allocation(BlockDriverState *bs, off_t start,
2676                            off_t *data, off_t *hole)
2677 {
2678 #if defined SEEK_HOLE && defined SEEK_DATA
2679     BDRVRawState *s = bs->opaque;
2680     off_t offs;
2681 
2682     /*
2683      * SEEK_DATA cases:
2684      * D1. offs == start: start is in data
2685      * D2. offs > start: start is in a hole, next data at offs
2686      * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2687      *                              or start is beyond EOF
2688      *     If the latter happens, the file has been truncated behind
2689      *     our back since we opened it.  All bets are off then.
2690      *     Treating it like a trailing hole is simplest.
2691      * D4. offs < 0, errno != ENXIO: we learned nothing
2692      */
2693     offs = lseek(s->fd, start, SEEK_DATA);
2694     if (offs < 0) {
2695         return -errno;          /* D3 or D4 */
2696     }
2697 
2698     if (offs < start) {
2699         /* This is not a valid return by lseek().  We are safe to just return
2700          * -EIO in this case, and we'll treat it like D4. */
2701         return -EIO;
2702     }
2703 
2704     if (offs > start) {
2705         /* D2: in hole, next data at offs */
2706         *hole = start;
2707         *data = offs;
2708         return 0;
2709     }
2710 
2711     /* D1: in data, end not yet known */
2712 
2713     /*
2714      * SEEK_HOLE cases:
2715      * H1. offs == start: start is in a hole
2716      *     If this happens here, a hole has been dug behind our back
2717      *     since the previous lseek().
2718      * H2. offs > start: either start is in data, next hole at offs,
2719      *                   or start is in trailing hole, EOF at offs
2720      *     Linux treats trailing holes like any other hole: offs ==
2721      *     start.  Solaris seeks to EOF instead: offs > start (blech).
2722      *     If that happens here, a hole has been dug behind our back
2723      *     since the previous lseek().
2724      * H3. offs < 0, errno = ENXIO: start is beyond EOF
2725      *     If this happens, the file has been truncated behind our
2726      *     back since we opened it.  Treat it like a trailing hole.
2727      * H4. offs < 0, errno != ENXIO: we learned nothing
2728      *     Pretend we know nothing at all, i.e. "forget" about D1.
2729      */
2730     offs = lseek(s->fd, start, SEEK_HOLE);
2731     if (offs < 0) {
2732         return -errno;          /* D1 and (H3 or H4) */
2733     }
2734 
2735     if (offs < start) {
2736         /* This is not a valid return by lseek().  We are safe to just return
2737          * -EIO in this case, and we'll treat it like H4. */
2738         return -EIO;
2739     }
2740 
2741     if (offs > start) {
2742         /*
2743          * D1 and H2: either in data, next hole at offs, or it was in
2744          * data but is now in a trailing hole.  In the latter case,
2745          * all bets are off.  Treating it as if there were data all
2746          * the way to EOF is safe, so simply do that.
2747          */
2748         *data = start;
2749         *hole = offs;
2750         return 0;
2751     }
2752 
2753     /* D1 and H1 */
2754     return -EBUSY;
2755 #else
2756     return -ENOTSUP;
2757 #endif
2758 }
2759 
2760 /*
2761  * Returns the allocation status of the specified offset.
2762  *
2763  * The block layer guarantees 'offset' and 'bytes' are within bounds.
2764  *
2765  * 'pnum' is set to the number of bytes (including and immediately following
2766  * the specified offset) that are known to be in the same
2767  * allocated/unallocated state.
2768  *
2769  * 'bytes' is a soft cap for 'pnum'.  If the information is free, 'pnum' may
2770  * well exceed it.
2771  */
2772 static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2773                                             bool want_zero,
2774                                             int64_t offset,
2775                                             int64_t bytes, int64_t *pnum,
2776                                             int64_t *map,
2777                                             BlockDriverState **file)
2778 {
2779     off_t data = 0, hole = 0;
2780     int ret;
2781 
2782     assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
2783 
2784     ret = fd_open(bs);
2785     if (ret < 0) {
2786         return ret;
2787     }
2788 
2789     if (!want_zero) {
2790         *pnum = bytes;
2791         *map = offset;
2792         *file = bs;
2793         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2794     }
2795 
2796     ret = find_allocation(bs, offset, &data, &hole);
2797     if (ret == -ENXIO) {
2798         /* Trailing hole */
2799         *pnum = bytes;
2800         ret = BDRV_BLOCK_ZERO;
2801     } else if (ret < 0) {
2802         /* No info available, so pretend there are no holes */
2803         *pnum = bytes;
2804         ret = BDRV_BLOCK_DATA;
2805     } else if (data == offset) {
2806         /* On a data extent, compute bytes to the end of the extent,
2807          * possibly including a partial sector at EOF. */
2808         *pnum = hole - offset;
2809 
2810         /*
2811          * We are not allowed to return partial sectors, though, so
2812          * round up if necessary.
2813          */
2814         if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
2815             int64_t file_length = raw_getlength(bs);
2816             if (file_length > 0) {
2817                 /* Ignore errors, this is just a safeguard */
2818                 assert(hole == file_length);
2819             }
2820             *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
2821         }
2822 
2823         ret = BDRV_BLOCK_DATA;
2824     } else {
2825         /* On a hole, compute bytes to the beginning of the next extent.  */
2826         assert(hole == offset);
2827         *pnum = data - offset;
2828         ret = BDRV_BLOCK_ZERO;
2829     }
2830     *map = offset;
2831     *file = bs;
2832     return ret | BDRV_BLOCK_OFFSET_VALID;
2833 }
2834 
2835 #if defined(__linux__)
2836 /* Verify that the file is not in the page cache */
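/*
 * The file is mapped in 128 MiB PROT_NONE windows and mincore(2) reports
 * per-page residency; any resident page means the cache was not dropped.
 */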
2837 static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2838 {
2839     const size_t window_size = 128 * 1024 * 1024;
2840     BDRVRawState *s = bs->opaque;
2841     void *window = NULL;
2842     size_t length = 0;
2843     unsigned char *vec;
2844     size_t page_size;
2845     off_t offset;
2846     off_t end;
2847 
2848     /* mincore(2) page status information requires 1 byte per page */
2849     page_size = sysconf(_SC_PAGESIZE);
2850     vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2851 
2852     end = raw_getlength(bs);
2853 
2854     for (offset = 0; offset < end; offset += window_size) {
2855         void *new_window;
2856         size_t new_length;
2857         size_t vec_end;
2858         size_t i;
2859         int ret;
2860 
2861         /* Unmap previous window if size has changed */
2862         new_length = MIN(end - offset, window_size);
2863         if (new_length != length) {
2864             munmap(window, length);
2865             window = NULL;
2866             length = 0;
2867         }
2868 
2869         new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2870                           s->fd, offset);
2871         if (new_window == MAP_FAILED) {
2872             error_setg_errno(errp, errno, "mmap failed");
2873             break;
2874         }
2875 
2876         window = new_window;
2877         length = new_length;
2878 
2879         ret = mincore(window, length, vec);
2880         if (ret < 0) {
2881             error_setg_errno(errp, errno, "mincore failed");
2882             break;
2883         }
2884 
2885         vec_end = DIV_ROUND_UP(length, page_size);
2886         for (i = 0; i < vec_end; i++) {
2887             if (vec[i] & 0x1) {
2888                 break;
2889             }
2890         }
2891         if (i < vec_end) {
2892             error_setg(errp, "page cache still in use!");
2893             break;
2894         }
2895     }
2896 
2897     if (window) {
2898         munmap(window, length);
2899     }
2900 
2901     g_free(vec);
2902 }
2903 #endif /* __linux__ */
2904 
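/*
 * Drop the host page cache so that, e.g., a migration destination reads
 * up-to-date data.  Only implemented on Linux (flush + POSIX_FADV_DONTNEED);
 * a no-op with O_DIRECT or when drop-cache is disabled.
 */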
2905 static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2906                                                  Error **errp)
2907 {
2908     BDRVRawState *s = bs->opaque;
2909     int ret;
2910 
2911     ret = fd_open(bs);
2912     if (ret < 0) {
2913         error_setg_errno(errp, -ret, "The file descriptor is not open");
2914         return;
2915     }
2916 
2917     if (!s->drop_cache) {
2918         return;
2919     }
2920 
2921     if (s->open_flags & O_DIRECT) {
2922         return; /* No host kernel page cache */
2923     }
2924 
2925 #if defined(__linux__)
2926     /* Flush dirty pages first so POSIX_FADV_DONTNEED below can drop them */
2927     ret = bdrv_co_flush(bs);
2928     if (ret < 0) {
2929         error_setg_errno(errp, -ret, "flush failed");
2930         return;
2931     }
2932 
2933     /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2934      * process.  These limitations are okay because we just fsynced the file,
2935      * we don't use mmap, and the file should not be in use by other processes.
2936      */
2937     ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2938     if (ret != 0) { /* the return value is a positive errno */
2939         error_setg_errno(errp, ret, "fadvise failed");
2940         return;
2941     }
2942 
2943     if (s->check_cache_dropped) {
2944         check_cache_dropped(bs, errp);
2945     }
2946 #else /* __linux__ */
2947     /* Do nothing.  Live migration to a remote host with cache.direct=off is
2948      * unsupported on other host operating systems.  Cache consistency issues
2949      * may occur but no error is reported here, partly because that's the
2950      * historical behavior and partly because it's hard to differentiate valid
2951      * configurations that should not cause errors.
2952      */
2953 #endif /* !__linux__ */
2954 }
2955 
2956 static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
2957 {
2958     if (ret) {
2959         s->stats.discard_nb_failed++;
2960     } else {
2961         s->stats.discard_nb_ok++;
2962         s->stats.discard_bytes_ok += nbytes;
2963     }
2964 }
2965 
2966 static coroutine_fn int
2967 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
2968                 bool blkdev)
2969 {
2970     BDRVRawState *s = bs->opaque;
2971     RawPosixAIOData acb;
2972     int ret;
2973 
2974     acb = (RawPosixAIOData) {
2975         .bs             = bs,
2976         .aio_fildes     = s->fd,
2977         .aio_type       = QEMU_AIO_DISCARD,
2978         .aio_offset     = offset,
2979         .aio_nbytes     = bytes,
2980     };
2981 
2982     if (blkdev) {
2983         acb.aio_type |= QEMU_AIO_BLKDEV;
2984     }
2985 
2986     ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
2987     raw_account_discard(s, bytes, ret);
2988     return ret;
2989 }
2990 
2991 static coroutine_fn int
2992 raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
2993 {
2994     return raw_do_pdiscard(bs, offset, bytes, false);
2995 }
2996 
2997 static int coroutine_fn
2998 raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
2999                      BdrvRequestFlags flags, bool blkdev)
3000 {
3001     BDRVRawState *s = bs->opaque;
3002     RawPosixAIOData acb;
3003     ThreadPoolFunc *handler;
3004 
3005 #ifdef CONFIG_FALLOCATE
3006     if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
3007         BdrvTrackedRequest *req;
3008 
3009         /*
3010          * This is a workaround for a bug in the Linux XFS driver,
3011          * where writes submitted through the AIO interface will be
3012          * discarded if they happen beyond a concurrently running
3013          * fallocate() that increases the file length (i.e., both the
3014          * write and the fallocate() happen beyond the EOF).
3015          *
3016          * To work around it, we extend the tracked request for this
3017          * zero write until INT64_MAX (effectively infinity), and mark
3018          * it as serializing.
3019          *
3020          * We have to enable this workaround for all filesystems and
3021          * AIO modes (not just XFS with aio=native), because for
3022          * remote filesystems we do not know the host configuration.
3023          */
3024 
3025         req = bdrv_co_get_self_request(bs);
3026         assert(req);
3027         assert(req->type == BDRV_TRACKED_WRITE);
3028         assert(req->offset <= offset);
3029         assert(req->offset + req->bytes >= offset + bytes);
3030 
3031         req->bytes = BDRV_MAX_LENGTH - req->offset;
3032 
3033         bdrv_check_request(req->offset, req->bytes, &error_abort);
3034 
3035         bdrv_make_request_serialising(req, bs->bl.request_alignment);
3036     }
3037 #endif
3038 
3039     acb = (RawPosixAIOData) {
3040         .bs             = bs,
3041         .aio_fildes     = s->fd,
3042         .aio_type       = QEMU_AIO_WRITE_ZEROES,
3043         .aio_offset     = offset,
3044         .aio_nbytes     = bytes,
3045     };
3046 
3047     if (blkdev) {
3048         acb.aio_type |= QEMU_AIO_BLKDEV;
3049     }
3050     if (flags & BDRV_REQ_NO_FALLBACK) {
3051         acb.aio_type |= QEMU_AIO_NO_FALLBACK;
3052     }
3053 
3054     if (flags & BDRV_REQ_MAY_UNMAP) {
3055         acb.aio_type |= QEMU_AIO_DISCARD;
3056         handler = handle_aiocb_write_zeroes_unmap;
3057     } else {
3058         handler = handle_aiocb_write_zeroes;
3059     }
3060 
3061     return raw_thread_pool_submit(bs, handler, &acb);
3062 }
3063 
3064 static int coroutine_fn raw_co_pwrite_zeroes(
3065     BlockDriverState *bs, int64_t offset,
3066     int64_t bytes, BdrvRequestFlags flags)
3067 {
3068     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
3069 }
3070 
3071 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3072 {
3073     return 0;
3074 }
3075 
3076 static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
3077 {
3078     BDRVRawState *s = bs->opaque;
3079     return (BlockStatsSpecificFile) {
3080         .discard_nb_ok = s->stats.discard_nb_ok,
3081         .discard_nb_failed = s->stats.discard_nb_failed,
3082         .discard_bytes_ok = s->stats.discard_bytes_ok,
3083     };
3084 }
3085 
3086 static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
3087 {
3088     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3089 
3090     stats->driver = BLOCKDEV_DRIVER_FILE;
3091     stats->u.file = get_blockstats_specific_file(bs);
3092 
3093     return stats;
3094 }
3095 
3096 #if defined(HAVE_HOST_BLOCK_DEVICE)
3097 static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
3098 {
3099     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3100 
3101     stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
3102     stats->u.host_device = get_blockstats_specific_file(bs);
3103 
3104     return stats;
3105 }
3106 #endif /* HAVE_HOST_BLOCK_DEVICE */
3107 
3108 static QemuOptsList raw_create_opts = {
3109     .name = "raw-create-opts",
3110     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
3111     .desc = {
3112         {
3113             .name = BLOCK_OPT_SIZE,
3114             .type = QEMU_OPT_SIZE,
3115             .help = "Virtual disk size"
3116         },
3117         {
3118             .name = BLOCK_OPT_NOCOW,
3119             .type = QEMU_OPT_BOOL,
3120             .help = "Turn off copy-on-write (valid only on btrfs)"
3121         },
3122         {
3123             .name = BLOCK_OPT_PREALLOC,
3124             .type = QEMU_OPT_STRING,
3125             .help = "Preallocation mode (allowed values: off"
3126 #ifdef CONFIG_POSIX_FALLOCATE
3127                     ", falloc"
3128 #endif
3129                     ", full)"
3130         },
3131         {
3132             .name = BLOCK_OPT_EXTENT_SIZE_HINT,
3133             .type = QEMU_OPT_SIZE,
3134             .help = "Extent size hint for the image file, 0 to disable"
3135         },
3136         { /* end of list */ }
3137     }
3138 };
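/*
 * These options surface through qemu-img create; illustrative example
 * (not part of this file):
 *   qemu-img create -f raw -o preallocation=falloc,extent_size_hint=1M \
 *       disk.img 10G
 */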
3139 
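/*
 * Permission updates follow a prepare/commit/abort protocol: raw_check_perm()
 * may open a replacement fd (s->perm_change_fd) when auto-read-only has to
 * switch between read-only and read-write, raw_set_perm() commits the switch,
 * and raw_abort_perm_update() closes the replacement fd again.
 */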
3140 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
3141                           Error **errp)
3142 {
3143     BDRVRawState *s = bs->opaque;
3144     int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags;
3145     int open_flags;
3146     int ret;
3147 
3148     /* We may need a new fd if auto-read-only switches the mode */
3149     ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm,
3150                                 false, errp);
3151     if (ret < 0) {
3152         return ret;
3153     } else if (ret != s->fd) {
3154         Error *local_err = NULL;
3155 
3156         /*
3157          * Fail check_perm() already if we can't get a working O_DIRECT
3158          * alignment with the new fd.
3159          */
3160         raw_probe_alignment(bs, ret, &local_err);
3161         if (local_err) {
3162             error_propagate(errp, local_err);
3163             return -EINVAL;
3164         }
3165 
3166         s->perm_change_fd = ret;
3167         s->perm_change_flags = open_flags;
3168     }
3169 
3170     /* Prepare permissions on old fd to avoid conflicts between old and new,
3171      * but keep everything locked that the new fd will need. */
3172     ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
3173     if (ret < 0) {
3174         goto fail;
3175     }
3176 
3177     /* Copy locks to the new fd */
3178     if (s->perm_change_fd && s->use_lock) {
3179         ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
3180                                    false, errp);
3181         if (ret < 0) {
3182             raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3183             goto fail;
3184         }
3185     }
3186     return 0;
3187 
3188 fail:
3189     if (s->perm_change_fd) {
3190         qemu_close(s->perm_change_fd);
3191     }
3192     s->perm_change_fd = 0;
3193     return ret;
3194 }
3195 
3196 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
3197 {
3198     BDRVRawState *s = bs->opaque;
3199 
3200     /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
3201      * called after .bdrv_reopen_commit) */
3202     if (s->perm_change_fd && s->fd != s->perm_change_fd) {
3203         qemu_close(s->fd);
3204         s->fd = s->perm_change_fd;
3205         s->open_flags = s->perm_change_flags;
3206     }
3207     s->perm_change_fd = 0;
3208 
3209     raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
3210     s->perm = perm;
3211     s->shared_perm = shared;
3212 }
3213 
3214 static void raw_abort_perm_update(BlockDriverState *bs)
3215 {
3216     BDRVRawState *s = bs->opaque;
3217 
3218     /* For reopen, .bdrv_reopen_abort is called afterwards and will close
3219      * the file descriptor. */
3220     if (s->perm_change_fd) {
3221         qemu_close(s->perm_change_fd);
3222     }
3223     s->perm_change_fd = 0;
3224 
3225     raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3226 }
3227 
3228 static int coroutine_fn raw_co_copy_range_from(
3229         BlockDriverState *bs, BdrvChild *src, int64_t src_offset,
3230         BdrvChild *dst, int64_t dst_offset, int64_t bytes,
3231         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
3232 {
3233     return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3234                                  read_flags, write_flags);
3235 }
3236 
3237 static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
3238                                              BdrvChild *src,
3239                                              int64_t src_offset,
3240                                              BdrvChild *dst,
3241                                              int64_t dst_offset,
3242                                              int64_t bytes,
3243                                              BdrvRequestFlags read_flags,
3244                                              BdrvRequestFlags write_flags)
3245 {
3246     RawPosixAIOData acb;
3247     BDRVRawState *s = bs->opaque;
3248     BDRVRawState *src_s;
3249 
3250     assert(dst->bs == bs);
3251     if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
3252         return -ENOTSUP;
3253     }
3254 
3255     src_s = src->bs->opaque;
3256     if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
3257         return -EIO;
3258     }
3259 
3260     acb = (RawPosixAIOData) {
3261         .bs             = bs,
3262         .aio_type       = QEMU_AIO_COPY_RANGE,
3263         .aio_fildes     = src_s->fd,
3264         .aio_offset     = src_offset,
3265         .aio_nbytes     = bytes,
3266         .copy_range     = {
3267             .aio_fd2        = s->fd,
3268             .aio_offset2    = dst_offset,
3269         },
3270     };
3271 
3272     return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
3273 }
3274 
3275 BlockDriver bdrv_file = {
3276     .format_name = "file",
3277     .protocol_name = "file",
3278     .instance_size = sizeof(BDRVRawState),
3279     .bdrv_needs_filename = true,
3280     .bdrv_probe = NULL, /* no probe for protocols */
3281     .bdrv_parse_filename = raw_parse_filename,
3282     .bdrv_file_open = raw_open,
3283     .bdrv_reopen_prepare = raw_reopen_prepare,
3284     .bdrv_reopen_commit = raw_reopen_commit,
3285     .bdrv_reopen_abort = raw_reopen_abort,
3286     .bdrv_close = raw_close,
3287     .bdrv_co_create = raw_co_create,
3288     .bdrv_co_create_opts = raw_co_create_opts,
3289     .bdrv_has_zero_init = bdrv_has_zero_init_1,
3290     .bdrv_co_block_status = raw_co_block_status,
3291     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3292     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
3293     .bdrv_co_delete_file = raw_co_delete_file,
3294 
3295     .bdrv_co_preadv         = raw_co_preadv,
3296     .bdrv_co_pwritev        = raw_co_pwritev,
3297     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3298     .bdrv_co_pdiscard       = raw_co_pdiscard,
3299     .bdrv_co_copy_range_from = raw_co_copy_range_from,
3300     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3301     .bdrv_refresh_limits = raw_refresh_limits,
3302     .bdrv_io_plug = raw_aio_plug,
3303     .bdrv_io_unplug = raw_aio_unplug,
3304     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3305 
3306     .bdrv_co_truncate = raw_co_truncate,
3307     .bdrv_getlength = raw_getlength,
3308     .bdrv_get_info = raw_get_info,
3309     .bdrv_get_allocated_file_size
3310                         = raw_get_allocated_file_size,
3311     .bdrv_get_specific_stats = raw_get_specific_stats,
3312     .bdrv_check_perm = raw_check_perm,
3313     .bdrv_set_perm   = raw_set_perm,
3314     .bdrv_abort_perm_update = raw_abort_perm_update,
3315     .create_opts = &raw_create_opts,
3316     .mutable_opts = mutable_opts,
3317 };
3318 
3319 /***********************************************/
3320 /* host device */
3321 
3322 #if defined(HAVE_HOST_BLOCK_DEVICE)
3323 
3324 #if defined(__APPLE__) && defined(__MACH__)
3325 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3326                                 CFIndex maxPathSize, int flags);
3327 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
3328 {
3329     kern_return_t kernResult = KERN_FAILURE;
3330     mach_port_t     masterPort;
3331     CFMutableDictionaryRef  classesToMatch;
3332     const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
3333     char *mediaType = NULL;
3334 
3335     kernResult = IOMasterPort(MACH_PORT_NULL, &masterPort);
3336     if (kernResult != KERN_SUCCESS) {
3337         error_report("IOMasterPort returned %d", kernResult);
3338     }
3339 
3340     int index;
3341     for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
3342         classesToMatch = IOServiceMatching(matching_array[index]);
3343         if (classesToMatch == NULL) {
3344             error_report("IOServiceMatching returned NULL for %s",
3345                          matching_array[index]);
3346             continue;
3347         }
3348         CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
3349                              kCFBooleanTrue);
3350         kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
3351                                                   mediaIterator);
3352         if (kernResult != KERN_SUCCESS) {
3353             error_report("Note: IOServiceGetMatchingServices returned %d",
3354                          kernResult);
3355             continue;
3356         }
3357 
3358         /* If a match was found, leave the loop */
3359         if (*mediaIterator != 0) {
3360             trace_file_FindEjectableOpticalMedia(matching_array[index]);
3361             mediaType = g_strdup(matching_array[index]);
3362             break;
3363         }
3364     }
3365     return mediaType;
3366 }
3367 
3368 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3369                          CFIndex maxPathSize, int flags)
3370 {
3371     io_object_t nextMedia;
3372     kern_return_t kernResult = KERN_FAILURE;
3373     *bsdPath = '\0';
3374     nextMedia = IOIteratorNext(mediaIterator);
3375     if (nextMedia) {
3376         CFTypeRef bsdPathAsCFString;
3377         bsdPathAsCFString = IORegistryEntryCreateCFProperty(
3378             nextMedia, CFSTR(kIOBSDNameKey), kCFAllocatorDefault, 0);
3379         if (bsdPathAsCFString) {
3380             size_t devPathLength;
3381             strcpy(bsdPath, _PATH_DEV);
3382             if (flags & BDRV_O_NOCACHE) {
3383                 strcat(bsdPath, "r");
3384             }
3385             devPathLength = strlen(bsdPath);
3386             if (CFStringGetCString(bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII)) {
3387                 kernResult = KERN_SUCCESS;
3388             }
3389             CFRelease(bsdPathAsCFString);
3390         }
3391         IOObjectRelease(nextMedia);
3392     }
3393 
3394     return kernResult;
3395 }
3396 
3397 /* Sets up a real cdrom for use in QEMU */
3398 static bool setup_cdrom(char *bsd_path, Error **errp)
3399 {
3400     int index, num_of_test_partitions = 2, fd;
3401     char test_partition[MAXPATHLEN];
3402     bool partition_found = false;
3403 
3404     /* look for a working partition */
3405     for (index = 0; index < num_of_test_partitions; index++) {
3406         snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
3407                  index);
3408         fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
3409         if (fd >= 0) {
3410             partition_found = true;
3411             qemu_close(fd);
3412             break;
3413         }
3414     }
3415 
3416     /* if a working partition on the device was not found */
3417     if (partition_found == false) {
3418         error_setg(errp, "Failed to find a working partition on disc");
3419     } else {
3420         trace_file_setup_cdrom(test_partition);
3421         pstrcpy(bsd_path, MAXPATHLEN, test_partition);
3422     }
3423     return partition_found;
3424 }
3425 
3426 /* Prints directions on mounting and unmounting a device */
3427 static void print_unmounting_directions(const char *file_name)
3428 {
3429     error_report("If device %s is mounted on the desktop, unmount"
3430                  " it first before using it in QEMU", file_name);
3431     error_report("Command to unmount device: diskutil unmountDisk %s",
3432                  file_name);
3433     error_report("Command to mount device: diskutil mountDisk %s", file_name);
3434 }
3435 
3436 #endif /* defined(__APPLE__) && defined(__MACH__) */
3437 
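     /*
      * Protocol probe for "host_device": score 100 for character or block
      * special files, 50 for /dev/cdrom so that a dedicated CD-ROM driver
      * can outbid it, and 0 otherwise.
      */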
3438 static int hdev_probe_device(const char *filename)
3439 {
3440     struct stat st;
3441 
3442     /* allow a dedicated CD-ROM driver to match with a higher priority */
3443     if (strstart(filename, "/dev/cdrom", NULL))
3444         return 50;
3445 
3446     if (stat(filename, &st) >= 0 &&
3447             (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
3448         return 100;
3449     }
3450 
3451     return 0;
3452 }
3453 
3454 static void hdev_parse_filename(const char *filename, QDict *options,
3455                                 Error **errp)
3456 {
3457     bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
3458 }
3459 
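     /*
      * Detect whether the opened host device is a Linux SCSI generic (sg)
      * character device by checking S_ISCHR() and probing the
      * SG_GET_VERSION_NUM and SG_GET_SCSI_ID ioctls; always false on
      * non-Linux hosts.
      */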
3460 static bool hdev_is_sg(BlockDriverState *bs)
3461 {
3462 
3463 #if defined(__linux__)
3464 
3465     BDRVRawState *s = bs->opaque;
3466     struct stat st;
3467     struct sg_scsi_id scsiid;
3468     int sg_version;
3469     int ret;
3470 
3471     if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
3472         return false;
3473     }
3474 
3475     ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
3476     if (ret < 0) {
3477         return false;
3478     }
3479 
3480     ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
3481     if (ret >= 0) {
3482         trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
3483         return true;
3484     }
3485 
3486 #endif
3487 
3488     return false;
3489 }
3490 
3491 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
3492                      Error **errp)
3493 {
3494     BDRVRawState *s = bs->opaque;
3495     int ret;
3496 
3497 #if defined(__APPLE__) && defined(__MACH__)
3498     /*
3499      * Caution: while qdict_get_str() is fine, getting non-string types
3500      * would require more care.  When @options come from -blockdev or
3501      * blockdev_add, its members are typed according to the QAPI
3502      * schema, but when they come from -drive, they're all QString.
3503      */
3504     const char *filename = qdict_get_str(options, "filename");
3505     char bsd_path[MAXPATHLEN] = "";
3506     bool error_occurred = false;
3507 
3508     /* If using a real cdrom */
3509     if (strcmp(filename, "/dev/cdrom") == 0) {
3510         char *mediaType = NULL;
3511         kern_return_t ret_val;
3512         io_iterator_t mediaIterator = 0;
3513 
3514         mediaType = FindEjectableOpticalMedia(&mediaIterator);
3515         if (mediaType == NULL) {
3516             error_setg(errp, "Please make sure your CD/DVD is in the optical"
3517                        " drive");
3518             error_occurred = true;
3519             goto hdev_open_Mac_error;
3520         }
3521 
3522         ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
3523         if (ret_val != KERN_SUCCESS) {
3524             error_setg(errp, "Could not get BSD path for optical drive");
3525             error_occurred = true;
3526             goto hdev_open_Mac_error;
3527         }
3528 
3529         /* If a real optical drive was not found */
3530         if (bsd_path[0] == '\0') {
3531             error_setg(errp, "Failed to obtain BSD path for optical drive");
3532             error_occurred = true;
3533             goto hdev_open_Mac_error;
3534         }
3535 
3536         /* If using a cdrom disc and finding a partition on the disc failed */
3537         if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
3538             setup_cdrom(bsd_path, errp) == false) {
3539             print_unmounting_directions(bsd_path);
3540             error_occurred = true;
3541             goto hdev_open_Mac_error;
3542         }
3543 
3544         qdict_put_str(options, "filename", bsd_path);
3545 
3546 hdev_open_Mac_error:
3547         g_free(mediaType);
3548         if (mediaIterator) {
3549             IOObjectRelease(mediaIterator);
3550         }
3551         if (error_occurred) {
3552             return -ENOENT;
3553         }
3554     }
3555 #endif /* defined(__APPLE__) && defined(__MACH__) */
3556 
3557     s->type = FTYPE_FILE;
3558 
3559     ret = raw_open_common(bs, options, flags, 0, true, errp);
3560     if (ret < 0) {
3561 #if defined(__APPLE__) && defined(__MACH__)
3562         if (*bsd_path) {
3563             filename = bsd_path;
3564         }
3565         /* if a physical device experienced an error while being opened */
3566         if (strncmp(filename, "/dev/", 5) == 0) {
3567             print_unmounting_directions(filename);
3568         }
3569 #endif /* defined(__APPLE__) && defined(__MACH__) */
3570         return ret;
3571     }
3572 
3573     /* Since this issues ioctls, the device must already be open */
3574     bs->sg = hdev_is_sg(bs);
3575 
3576     return ret;
3577 }
3578 
3579 #if defined(__linux__)
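     /*
      * SG_IO PERSISTENT RESERVE IN/OUT commands are diverted to the
      * configured persistent reservation manager, if any; every other
      * ioctl request is forwarded to the device via the thread pool.
      */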
3580 static int coroutine_fn
3581 hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3582 {
3583     BDRVRawState *s = bs->opaque;
3584     RawPosixAIOData acb;
3585     int ret;
3586 
3587     ret = fd_open(bs);
3588     if (ret < 0) {
3589         return ret;
3590     }
3591 
3592     if (req == SG_IO && s->pr_mgr) {
3593         struct sg_io_hdr *io_hdr = buf;
3594         if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3595             io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3596             return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3597                                       s->fd, io_hdr);
3598         }
3599     }
3600 
3601     acb = (RawPosixAIOData) {
3602         .bs         = bs,
3603         .aio_type   = QEMU_AIO_IOCTL,
3604         .aio_fildes = s->fd,
3605         .aio_offset = 0,
3606         .ioctl      = {
3607             .buf        = buf,
3608             .cmd        = req,
3609         },
3610     };
3611 
3612     return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
3613 }
3614 #endif /* linux */
3615 
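     /*
      * Host-device discard and write-zeroes reuse the raw helpers, but call
      * fd_open() first so that a device without a usable file descriptor
      * fails up front (a failed discard is still accounted in the stats).
      */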
3616 static coroutine_fn int
3617 hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
3618 {
3619     BDRVRawState *s = bs->opaque;
3620     int ret;
3621 
3622     ret = fd_open(bs);
3623     if (ret < 0) {
3624         raw_account_discard(s, bytes, ret);
3625         return ret;
3626     }
3627     return raw_do_pdiscard(bs, offset, bytes, true);
3628 }
3629 
3630 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3631     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
3632 {
3633     int rc;
3634 
3635     rc = fd_open(bs);
3636     if (rc < 0) {
3637         return rc;
3638     }
3639 
3640     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
3641 }
3642 
3643 static BlockDriver bdrv_host_device = {
3644     .format_name        = "host_device",
3645     .protocol_name      = "host_device",
3646     .instance_size      = sizeof(BDRVRawState),
3647     .bdrv_needs_filename = true,
3648     .bdrv_probe_device  = hdev_probe_device,
3649     .bdrv_parse_filename = hdev_parse_filename,
3650     .bdrv_file_open     = hdev_open,
3651     .bdrv_close         = raw_close,
3652     .bdrv_reopen_prepare = raw_reopen_prepare,
3653     .bdrv_reopen_commit  = raw_reopen_commit,
3654     .bdrv_reopen_abort   = raw_reopen_abort,
3655     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3656     .create_opts         = &bdrv_create_opts_simple,
3657     .mutable_opts        = mutable_opts,
3658     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3659     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3660 
3661     .bdrv_co_preadv         = raw_co_preadv,
3662     .bdrv_co_pwritev        = raw_co_pwritev,
3663     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3664     .bdrv_co_pdiscard       = hdev_co_pdiscard,
3665     .bdrv_co_copy_range_from = raw_co_copy_range_from,
3666     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3667     .bdrv_refresh_limits = raw_refresh_limits,
3668     .bdrv_io_plug = raw_aio_plug,
3669     .bdrv_io_unplug = raw_aio_unplug,
3670     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3671 
3672     .bdrv_co_truncate       = raw_co_truncate,
3673     .bdrv_getlength         = raw_getlength,
3674     .bdrv_get_info = raw_get_info,
3675     .bdrv_get_allocated_file_size
3676                         = raw_get_allocated_file_size,
3677     .bdrv_get_specific_stats = hdev_get_specific_stats,
3678     .bdrv_check_perm = raw_check_perm,
3679     .bdrv_set_perm   = raw_set_perm,
3680     .bdrv_abort_perm_update = raw_abort_perm_update,
3681     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3682     .bdrv_probe_geometry = hdev_probe_geometry,
3683 
3684     /* generic scsi device */
3685 #ifdef __linux__
3686     .bdrv_co_ioctl          = hdev_co_ioctl,
3687 #endif
3688 };
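     /*
      * Usage sketch (illustrative): passing a whole block device through,
      * e.g.
      *     -blockdev driver=host_device,node-name=disk0,filename=/dev/sdb
      * with a placeholder device path.
      */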
3689 
3690 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3691 static void cdrom_parse_filename(const char *filename, QDict *options,
3692                                  Error **errp)
3693 {
3694     bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3695 }
3696 #endif
3697 
3698 #ifdef __linux__
3699 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3700                       Error **errp)
3701 {
3702     BDRVRawState *s = bs->opaque;
3703 
3704     s->type = FTYPE_CD;
3705 
3706     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3707     return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3708 }
3709 
3710 static int cdrom_probe_device(const char *filename)
3711 {
3712     int fd, ret;
3713     int prio = 0;
3714     struct stat st;
3715 
3716     fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
3717     if (fd < 0) {
3718         goto out;
3719     }
3720     ret = fstat(fd, &st);
3721     if (ret == -1 || !S_ISBLK(st.st_mode)) {
3722         goto outc;
3723     }
3724 
3725     /* Attempt to detect via a CDROM specific ioctl */
3726     ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3727     if (ret >= 0)
3728         prio = 100;
3729 
3730 outc:
3731     qemu_close(fd);
3732 out:
3733     return prio;
3734 }
3735 
3736 static bool cdrom_is_inserted(BlockDriverState *bs)
3737 {
3738     BDRVRawState *s = bs->opaque;
3739     int ret;
3740 
3741     ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3742     return ret == CDS_DISC_OK;
3743 }
3744 
3745 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3746 {
3747     BDRVRawState *s = bs->opaque;
3748 
3749     if (eject_flag) {
3750         if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3751             perror("CDROMEJECT");
3752     } else {
3753         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3754             perror("CDROMCLOSETRAY");
3755     }
3756 }
3757 
3758 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3759 {
3760     BDRVRawState *s = bs->opaque;
3761 
3762     if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3763         /*
3764          * Note: an error can happen if the distribution automatically
3765          * mounts the CD-ROM
3766          */
3767         /* perror("CDROM_LOCKDOOR"); */
3768     }
3769 }
3770 
3771 static BlockDriver bdrv_host_cdrom = {
3772     .format_name        = "host_cdrom",
3773     .protocol_name      = "host_cdrom",
3774     .instance_size      = sizeof(BDRVRawState),
3775     .bdrv_needs_filename = true,
3776     .bdrv_probe_device  = cdrom_probe_device,
3777     .bdrv_parse_filename = cdrom_parse_filename,
3778     .bdrv_file_open     = cdrom_open,
3779     .bdrv_close         = raw_close,
3780     .bdrv_reopen_prepare = raw_reopen_prepare,
3781     .bdrv_reopen_commit  = raw_reopen_commit,
3782     .bdrv_reopen_abort   = raw_reopen_abort,
3783     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3784     .create_opts         = &bdrv_create_opts_simple,
3785     .mutable_opts        = mutable_opts,
3786     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3787 
3788     .bdrv_co_preadv         = raw_co_preadv,
3789     .bdrv_co_pwritev        = raw_co_pwritev,
3790     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3791     .bdrv_refresh_limits = raw_refresh_limits,
3792     .bdrv_io_plug = raw_aio_plug,
3793     .bdrv_io_unplug = raw_aio_unplug,
3794     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3795 
3796     .bdrv_co_truncate    = raw_co_truncate,
3797     .bdrv_getlength      = raw_getlength,
3798     .has_variable_length = true,
3799     .bdrv_get_allocated_file_size
3800                         = raw_get_allocated_file_size,
3801 
3802     /* removable device support */
3803     .bdrv_is_inserted   = cdrom_is_inserted,
3804     .bdrv_eject         = cdrom_eject,
3805     .bdrv_lock_medium   = cdrom_lock_medium,
3806 
3807     /* generic scsi device */
3808     .bdrv_co_ioctl      = hdev_co_ioctl,
3809 };
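     /*
      * Usage sketch (illustrative): a passed-through host CD-ROM, e.g.
      *     -blockdev driver=host_cdrom,node-name=cd0,filename=/dev/sr0
      * with a placeholder device path.
      */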
3810 #endif /* __linux__ */
3811 
3812 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3813 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3814                       Error **errp)
3815 {
3816     BDRVRawState *s = bs->opaque;
3817     int ret;
3818 
3819     s->type = FTYPE_CD;
3820 
3821     ret = raw_open_common(bs, options, flags, 0, true, errp);
3822     if (ret) {
3823         return ret;
3824     }
3825 
3826     /* make sure the door isn't locked at this time */
3827     ioctl(s->fd, CDIOCALLOW);
3828     return 0;
3829 }
3830 
3831 static int cdrom_probe_device(const char *filename)
3832 {
3833     if (strstart(filename, "/dev/cd", NULL) ||
3834             strstart(filename, "/dev/acd", NULL))
3835         return 100;
3836     return 0;
3837 }
3838 
3839 static int cdrom_reopen(BlockDriverState *bs)
3840 {
3841     BDRVRawState *s = bs->opaque;
3842     int fd;
3843 
3844     /*
3845      * Force a reread of a possibly changed or newly loaded disc;
3846      * FreeBSD sometimes fails to notice the change otherwise.
3847      */
3848     if (s->fd >= 0)
3849         qemu_close(s->fd);
3850     fd = qemu_open(bs->filename, s->open_flags, NULL);
3851     if (fd < 0) {
3852         s->fd = -1;
3853         return -EIO;
3854     }
3855     s->fd = fd;
3856 
3857     /* make sure the door isn't locked at this time */
3858     ioctl(s->fd, CDIOCALLOW);
3859     return 0;
3860 }
3861 
3862 static bool cdrom_is_inserted(BlockDriverState *bs)
3863 {
3864     return raw_getlength(bs) > 0;
3865 }
3866 
3867 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3868 {
3869     BDRVRawState *s = bs->opaque;
3870 
3871     if (s->fd < 0)
3872         return;
3873 
3874     (void) ioctl(s->fd, CDIOCALLOW);
3875 
3876     if (eject_flag) {
3877         if (ioctl(s->fd, CDIOCEJECT) < 0)
3878             perror("CDIOCEJECT");
3879     } else {
3880         if (ioctl(s->fd, CDIOCCLOSE) < 0)
3881             perror("CDIOCCLOSE");
3882     }
3883 
3884     cdrom_reopen(bs);
3885 }
3886 
3887 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3888 {
3889     BDRVRawState *s = bs->opaque;
3890 
3891     if (s->fd < 0)
3892         return;
3893     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3894         /*
3895          * Note: an error can happen if the distribution automatically
3896          * mounts the CD-ROM
3897          */
3898         /* perror("CDROM_LOCKDOOR"); */
3899     }
3900 }
3901 
3902 static BlockDriver bdrv_host_cdrom = {
3903     .format_name        = "host_cdrom",
3904     .protocol_name      = "host_cdrom",
3905     .instance_size      = sizeof(BDRVRawState),
3906     .bdrv_needs_filename = true,
3907     .bdrv_probe_device  = cdrom_probe_device,
3908     .bdrv_parse_filename = cdrom_parse_filename,
3909     .bdrv_file_open     = cdrom_open,
3910     .bdrv_close         = raw_close,
3911     .bdrv_reopen_prepare = raw_reopen_prepare,
3912     .bdrv_reopen_commit  = raw_reopen_commit,
3913     .bdrv_reopen_abort   = raw_reopen_abort,
3914     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3915     .create_opts         = &bdrv_create_opts_simple,
3916     .mutable_opts       = mutable_opts,
3917 
3918     .bdrv_co_preadv         = raw_co_preadv,
3919     .bdrv_co_pwritev        = raw_co_pwritev,
3920     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3921     .bdrv_refresh_limits = raw_refresh_limits,
3922     .bdrv_io_plug = raw_aio_plug,
3923     .bdrv_io_unplug = raw_aio_unplug,
3924     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3925 
3926     .bdrv_co_truncate    = raw_co_truncate,
3927     .bdrv_getlength      = raw_getlength,
3928     .has_variable_length = true,
3929     .bdrv_get_allocated_file_size
3930                         = raw_get_allocated_file_size,
3931 
3932     /* removable device support */
3933     .bdrv_is_inserted   = cdrom_is_inserted,
3934     .bdrv_eject         = cdrom_eject,
3935     .bdrv_lock_medium   = cdrom_lock_medium,
3936 };
3937 #endif /* __FreeBSD__ */
3938 
3939 #endif /* HAVE_HOST_BLOCK_DEVICE */
3940 
3941 static void bdrv_file_init(void)
3942 {
3943     /*
3944      * Register all the drivers.  Note that order is important, the driver
3945      * registered last will get probed first.
3946      */
3947     bdrv_register(&bdrv_file);
3948 #if defined(HAVE_HOST_BLOCK_DEVICE)
3949     bdrv_register(&bdrv_host_device);
3950 #ifdef __linux__
3951     bdrv_register(&bdrv_host_cdrom);
3952 #endif
3953 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3954     bdrv_register(&bdrv_host_cdrom);
3955 #endif
3956 #endif /* HAVE_HOST_BLOCK_DEVICE */
3957 }
3958 
3959 block_init(bdrv_file_init);
3960