xref: /openbmc/qemu/block/file-posix.c (revision 9e60d759d38d1faae1d85de2c53411e635be3cf2)
1 /*
2  * Block driver for RAW files (posix)
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qapi/error.h"
28 #include "qemu/cutils.h"
29 #include "qemu/error-report.h"
30 #include "block/block_int.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "qemu/units.h"
34 #include "trace.h"
35 #include "block/thread-pool.h"
36 #include "qemu/iov.h"
37 #include "block/raw-aio.h"
38 #include "qapi/qmp/qdict.h"
39 #include "qapi/qmp/qstring.h"
40 
41 #include "scsi/pr-manager.h"
42 #include "scsi/constants.h"
43 
44 #if defined(__APPLE__) && (__MACH__)
45 #include <paths.h>
46 #include <sys/param.h>
47 #include <IOKit/IOKitLib.h>
48 #include <IOKit/IOBSD.h>
49 #include <IOKit/storage/IOMediaBSDClient.h>
50 #include <IOKit/storage/IOMedia.h>
51 #include <IOKit/storage/IOCDMedia.h>
52 //#include <IOKit/storage/IOCDTypes.h>
53 #include <IOKit/storage/IODVDMedia.h>
54 #include <CoreFoundation/CoreFoundation.h>
55 #endif
56 
57 #ifdef __sun__
58 #define _POSIX_PTHREAD_SEMANTICS 1
59 #include <sys/dkio.h>
60 #endif
61 #ifdef __linux__
62 #include <sys/ioctl.h>
63 #include <sys/param.h>
64 #include <sys/syscall.h>
65 #include <sys/vfs.h>
66 #include <linux/cdrom.h>
67 #include <linux/fd.h>
68 #include <linux/fs.h>
69 #include <linux/hdreg.h>
70 #include <linux/magic.h>
71 #include <scsi/sg.h>
72 #ifdef __s390__
73 #include <asm/dasd.h>
74 #endif
75 #ifndef FS_NOCOW_FL
76 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
77 #endif
78 #endif
79 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
80 #include <linux/falloc.h>
81 #endif
82 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
83 #include <sys/disk.h>
84 #include <sys/cdio.h>
85 #endif
86 
87 #ifdef __OpenBSD__
88 #include <sys/ioctl.h>
89 #include <sys/disklabel.h>
90 #include <sys/dkio.h>
91 #endif
92 
93 #ifdef __NetBSD__
94 #include <sys/ioctl.h>
95 #include <sys/disklabel.h>
96 #include <sys/dkio.h>
97 #include <sys/disk.h>
98 #endif
99 
100 #ifdef __DragonFly__
101 #include <sys/ioctl.h>
102 #include <sys/diskslice.h>
103 #endif
104 
105 #ifdef CONFIG_XFS
106 #include <xfs/xfs.h>
107 #endif
108 
109 #include "trace.h"
110 
111 /* OS X does not have O_DSYNC */
112 #ifndef O_DSYNC
113 #ifdef O_SYNC
114 #define O_DSYNC O_SYNC
115 #elif defined(O_FSYNC)
116 #define O_DSYNC O_FSYNC
117 #endif
118 #endif
119 
120 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
121 #ifndef O_DIRECT
122 #define O_DIRECT O_DSYNC
123 #endif
124 
125 #define FTYPE_FILE   0
126 #define FTYPE_CD     1
127 
128 #define MAX_BLOCKSIZE	4096
129 
130 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
131  * leaving a few more bytes for its future use. */
132 #define RAW_LOCK_PERM_BASE             100
133 #define RAW_LOCK_SHARED_BASE           200
134 
/*
 * Driver-private state of an open image; reached through bs->opaque in the
 * functions of this file.
 */
135 typedef struct BDRVRawState {
136     int fd;             /* descriptor of the opened file, set by raw_open_common() */
137     bool use_lock;      /* lock permission bytes in the file ("locking" option) */
138     int type;           /* FTYPE_FILE or FTYPE_CD */
139     int open_flags;     /* open(2) flags that fd was opened with */
140     size_t buf_align;   /* memory buffer alignment found by raw_probe_alignment() */
141 
142     /* The current permissions. */
143     uint64_t perm;
144     uint64_t shared_perm;
145 
146     /* The perms bits whose corresponding bytes are already locked in
147      * s->fd. */
148     uint64_t locked_perm;
149     uint64_t locked_shared_perm;
150 
    /*
     * NOTE(review): the next three fields are not referenced in the part of
     * the file visible here; presumably they stage a pending permission
     * change / reopen -- confirm against their users.
     */
151     int perm_change_fd;
152     int perm_change_flags;
153     BDRVReopenState *reopen_state;
154 
155 #ifdef CONFIG_XFS
156     bool is_xfs:1;                  /* platform_test_xfs_fd() succeeded on fd */
157 #endif
158     bool has_discard:1;             /* set to true by raw_open_common() */
159     bool has_write_zeroes:1;        /* cleared for Linux block devices without O_DIRECT */
160     bool discard_zeroes:1;          /* discarded ranges are expected to read back as zeroes */
161     bool use_linux_aio:1;           /* aio=native was selected */
162     bool use_linux_io_uring:1;      /* aio=io_uring was selected */
163     bool page_cache_inconsistent:1; /* NOTE(review): not used in the visible code */
164     bool has_fallocate;             /* set for regular files by raw_open_common() */
165     bool needs_alignment;           /* requests and buffers must honour probed alignment */
166     bool drop_cache;                /* value of the "drop-cache" option (default true) */
167     bool check_cache_dropped;       /* value of "x-check-cache-dropped" (default false) */
    /* Discard counters; NOTE(review): updated outside the visible code. */
168     struct {
169         uint64_t discard_nb_ok;
170         uint64_t discard_nb_failed;
171         uint64_t discard_bytes_ok;
172     } stats;
173 
174     PRManager *pr_mgr;  /* looked up from the "pr-manager" option, if given */
175 } BDRVRawState;
176 
/*
 * Values prepared for a reopen. The field names mirror the BDRVRawState
 * members of the same name; NOTE(review): the reopen code itself is outside
 * the visible part of the file -- confirm how these are committed.
 */
177 typedef struct BDRVRawReopenState {
178     int fd;
179     int open_flags;
180     bool drop_cache;
181     bool check_cache_dropped;
182 } BDRVRawReopenState;
183 
/* Forward declarations; the definitions are not in the part of the file shown here. */
184 static int fd_open(BlockDriverState *bs);
185 static int64_t raw_getlength(BlockDriverState *bs);
186 
/*
 * Description of a single request: target node and descriptor, offset and
 * length, plus per-operation arguments in the anonymous union.
 * NOTE(review): the code that fills and consumes this structure is outside
 * the visible part of the file; which union member is valid presumably
 * depends on aio_type -- confirm.
 */
187 typedef struct RawPosixAIOData {
188     BlockDriverState *bs;
189     int aio_type;
190     int aio_fildes;
191 
192     off_t aio_offset;
193     uint64_t aio_nbytes;
194 
195     union {
        /* scatter/gather list for data transfers */
196         struct {
197             struct iovec *iov;
198             int niov;
199         } io;
        /* ioctl command and its argument buffer */
200         struct {
201             uint64_t cmd;
202             void *buf;
203         } ioctl;
        /* second descriptor and offset for a copy between two files */
204         struct {
205             int aio_fd2;
206             off_t aio_offset2;
207         } copy_range;
        /* preallocation mode and error destination for a truncate */
208         struct {
209             PreallocMode prealloc;
210             Error **errp;
211         } truncate;
212     };
213 } RawPosixAIOData;
214 
/* Forward declaration, only present in FreeBSD builds. */
215 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
216 static int cdrom_reopen(BlockDriverState *bs);
217 #endif
218 
219 #if defined(__NetBSD__)
220 static int raw_normalize_devicepath(const char **filename, Error **errp)
221 {
222     static char namebuf[PATH_MAX];
223     const char *dp, *fname;
224     struct stat sb;
225 
226     fname = *filename;
227     dp = strrchr(fname, '/');
228     if (lstat(fname, &sb) < 0) {
229         error_setg_file_open(errp, errno, fname);
230         return -errno;
231     }
232 
233     if (!S_ISBLK(sb.st_mode)) {
234         return 0;
235     }
236 
237     if (dp == NULL) {
238         snprintf(namebuf, PATH_MAX, "r%s", fname);
239     } else {
240         snprintf(namebuf, PATH_MAX, "%.*s/r%s",
241             (int)(dp - fname), fname, dp + 1);
242     }
243     *filename = namebuf;
244     warn_report("%s is a block device, using %s", fname, *filename);
245 
246     return 0;
247 }
248 #else
249 static int raw_normalize_devicepath(const char **filename, Error **errp)
250 {
251     return 0;
252 }
253 #endif
254 
/*
 * Query the logical block size of @fd through whichever size ioctls the
 * host defines.
 *
 * Every available ioctl is tried; each one that succeeds stores its result
 * in @sector_size_p, so the last successful one wins.
 *
 * Returns 0 if at least one ioctl succeeded, otherwise -errno (-ENOTSUP when
 * no ioctl is available on this host).
 */
static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
{
    static const unsigned long candidates[] = {
#ifdef BLKSSZGET
        BLKSSZGET,
#endif
#ifdef DKIOCGETBLOCKSIZE
        DKIOCGETBLOCKSIZE,
#endif
#ifdef DIOCGSECTORSIZE
        DIOCGSECTORSIZE,
#endif
    };
    unsigned int reported;
    bool found = false;
    int idx = 0;

    /* Reported if the candidate list turns out to be empty. */
    errno = ENOTSUP;

    while (idx < (int)ARRAY_SIZE(candidates)) {
        if (ioctl(fd, candidates[idx], &reported) >= 0) {
            *sector_size_p = reported;
            found = true;
        }
        idx++;
    }

    if (found) {
        return 0;
    }
    return -errno;
}
287 
/**
 * Query the physical block size of @fd.
 *
 * Returns 0 and fills @blk_size on success, -errno if the ioctl fails, or
 * -ENOTSUP when the host does not define BLKPBSZGET.
 */
static int probe_physical_blocksize(int fd, unsigned int *blk_size)
{
#ifdef BLKPBSZGET
    int rc = ioctl(fd, BLKPBSZGET, blk_size);

    return rc < 0 ? -errno : 0;
#else
    return -ENOTSUP;
#endif
}
304 
/*
 * Tell whether @fd can be used without any alignment restriction even when
 * it was opened with O_DIRECT.
 *
 * raw_probe_alignment() treats a probed alignment of 1 as a failed probe and
 * falls back to a safe default of 4k. Knowing up front that byte alignment
 * is fine for the file avoids that fallback. Currently only files on NFS
 * (Linux hosts) are recognised; everything else yields false.
 */
static bool dio_byte_aligned(int fd)
{
    bool byte_aligned = false;
#ifdef __linux__
    struct statfs fs_info;

    if (fstatfs(fd, &fs_info) == 0) {
        byte_aligned = (fs_info.f_type == NFS_SUPER_MAGIC);
    }
#endif
    return byte_aligned;
}
326 
/*
 * Probe whether a read of @len bytes at offset 0 into @buf is accepted.
 *
 * Used to discover the O_DIRECT buffer and request alignment of @fd.
 * Returns true if the combination is usable, false if it was rejected.
 */
static bool raw_is_io_aligned(int fd, void *buf, size_t len)
{
    if (pread(fd, buf, len, 0) >= 0) {
        return true;
    }

#ifdef __linux__
    /*
     * Linux reports misaligned O_DIRECT reads with EINVAL. Any other error
     * (for instance a genuine I/O error on a failing drive) says nothing
     * about alignment, so it is not treated as a rejection.
     */
    return errno != EINVAL;
#else
    return false;
#endif
}
351 
352 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
353 {
354     BDRVRawState *s = bs->opaque;
355     char *buf;
356     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
357     size_t alignments[] = {1, 512, 1024, 2048, 4096};
358 
359     /* For SCSI generic devices the alignment is not really used.
360        With buffered I/O, we don't have any restrictions. */
361     if (bdrv_is_sg(bs) || !s->needs_alignment) {
362         bs->bl.request_alignment = 1;
363         s->buf_align = 1;
364         return;
365     }
366 
367     bs->bl.request_alignment = 0;
368     s->buf_align = 0;
369     /* Let's try to use the logical blocksize for the alignment. */
370     if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
371         bs->bl.request_alignment = 0;
372     }
373 #ifdef CONFIG_XFS
374     if (s->is_xfs) {
375         struct dioattr da;
376         if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
377             bs->bl.request_alignment = da.d_miniosz;
378             /* The kernel returns wrong information for d_mem */
379             /* s->buf_align = da.d_mem; */
380         }
381     }
382 #endif
383 
384     /*
385      * If we could not get the sizes so far, we can only guess them. First try
386      * to detect request alignment, since it is more likely to succeed. Then
387      * try to detect buf_align, which cannot be detected in some cases (e.g.
388      * Gluster). If buf_align cannot be detected, we fallback to the value of
389      * request_alignment.
390      */
391 
392     if (!bs->bl.request_alignment) {
393         int i;
394         size_t align;
395         buf = qemu_memalign(max_align, max_align);
396         for (i = 0; i < ARRAY_SIZE(alignments); i++) {
397             align = alignments[i];
398             if (raw_is_io_aligned(fd, buf, align)) {
399                 /* Fallback to safe value. */
400                 bs->bl.request_alignment = (align != 1) ? align : max_align;
401                 break;
402             }
403         }
404         qemu_vfree(buf);
405     }
406 
407     if (!s->buf_align) {
408         int i;
409         size_t align;
410         buf = qemu_memalign(max_align, 2 * max_align);
411         for (i = 0; i < ARRAY_SIZE(alignments); i++) {
412             align = alignments[i];
413             if (raw_is_io_aligned(fd, buf + align, max_align)) {
414                 /* Fallback to request_alignment. */
415                 s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
416                 break;
417             }
418         }
419         qemu_vfree(buf);
420     }
421 
422     if (!s->buf_align || !bs->bl.request_alignment) {
423         error_setg(errp, "Could not find working O_DIRECT alignment");
424         error_append_hint(errp, "Try cache.direct=off\n");
425     }
426 }
427 
/*
 * Check that the block device behind @fd has not been flagged read-only.
 *
 * Returns 0 if @fd is writable or is not a block device (or if the host has
 * no BLKROGET), -EACCES if the device is read-only, and -errno if fstat()
 * or the ioctl fails.
 */
static int check_hdev_writable(int fd)
{
#if defined(BLKROGET)
    /*
     * blockdev(8) can mark a Linux block device "read-only" independently
     * of the permissions of its device node, so open(2) with O_RDWR still
     * succeeds and only the actual writes fail with EPERM. bdrv_open() is
     * supposed to fail for read-only disks, hence the explicit check.
     */
    struct stat info;
    int ro_flag = 0;

    if (fstat(fd, &info) != 0) {
        return -errno;
    }

    if (S_ISBLK(info.st_mode)) {
        if (ioctl(fd, BLKROGET, &ro_flag) < 0) {
            return -errno;
        }
        if (ro_flag) {
            return -EACCES;
        }
    }
#endif /* defined(BLKROGET) */
    return 0;
}
460 
461 static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
462 {
463     bool read_write = false;
464     assert(open_flags != NULL);
465 
466     *open_flags |= O_BINARY;
467     *open_flags &= ~O_ACCMODE;
468 
469     if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
470         read_write = has_writers;
471     } else if (bdrv_flags & BDRV_O_RDWR) {
472         read_write = true;
473     }
474 
475     if (read_write) {
476         *open_flags |= O_RDWR;
477     } else {
478         *open_flags |= O_RDONLY;
479     }
480 
481     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
482      * and O_DIRECT for no caching. */
483     if ((bdrv_flags & BDRV_O_NOCACHE)) {
484         *open_flags |= O_DIRECT;
485     }
486 }
487 
488 static void raw_parse_filename(const char *filename, QDict *options,
489                                Error **errp)
490 {
491     bdrv_parse_filename_strip_prefix(filename, "file:", options);
492 }
493 
/*
 * Options that raw_open_common() absorbs from the options QDict.
 * The defaults quoted in the help texts are the ones raw_open_common()
 * passes when it reads each option.
 */
494 static QemuOptsList raw_runtime_opts = {
495     .name = "raw",
496     .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
497     .desc = {
498         {
499             .name = "filename",
500             .type = QEMU_OPT_STRING,
501             .help = "File name of the image",
502         },
503         {
504             .name = "aio",
505             .type = QEMU_OPT_STRING,
506             .help = "host AIO implementation (threads, native, io_uring)",
507         },
508         {
509             .name = "locking",
510             .type = QEMU_OPT_STRING,
511             .help = "file locking mode (on/off/auto, default: auto)",
512         },
513         {
514             .name = "pr-manager",
515             .type = QEMU_OPT_STRING,
516             .help = "id of persistent reservation manager object (default: none)",
517         },
        /* "drop-cache" is only accepted on Linux builds. */
518 #if defined(__linux__)
519         {
520             .name = "drop-cache",
521             .type = QEMU_OPT_BOOL,
522             .help = "invalidate page cache during live migration (default: on)",
523         },
524 #endif
525         {
526             .name = "x-check-cache-dropped",
527             .type = QEMU_OPT_BOOL,
528             .help = "check that page cache was dropped on live migration (default: off)"
529         },
530         { /* end of list */ }
531     },
532 };
533 
/*
 * NULL-terminated list of option names.
 * NOTE(review): the user of this array is outside the visible part of the
 * file; presumably these are the options allowed to change on reopen --
 * confirm.
 */
534 static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
535 
536 static int raw_open_common(BlockDriverState *bs, QDict *options,
537                            int bdrv_flags, int open_flags,
538                            bool device, Error **errp)
539 {
540     BDRVRawState *s = bs->opaque;
541     QemuOpts *opts;
542     Error *local_err = NULL;
543     const char *filename = NULL;
544     const char *str;
545     BlockdevAioOptions aio, aio_default;
546     int fd, ret;
547     struct stat st;
548     OnOffAuto locking;
549 
550     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
551     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
552         ret = -EINVAL;
553         goto fail;
554     }
555 
556     filename = qemu_opt_get(opts, "filename");
557 
558     ret = raw_normalize_devicepath(&filename, errp);
559     if (ret != 0) {
560         goto fail;
561     }
562 
563     if (bdrv_flags & BDRV_O_NATIVE_AIO) {
564         aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
565 #ifdef CONFIG_LINUX_IO_URING
566     } else if (bdrv_flags & BDRV_O_IO_URING) {
567         aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
568 #endif
569     } else {
570         aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
571     }
572 
573     aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
574                           qemu_opt_get(opts, "aio"),
575                           aio_default, &local_err);
576     if (local_err) {
577         error_propagate(errp, local_err);
578         ret = -EINVAL;
579         goto fail;
580     }
581 
582     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
583 #ifdef CONFIG_LINUX_IO_URING
584     s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
585 #endif
586 
587     locking = qapi_enum_parse(&OnOffAuto_lookup,
588                               qemu_opt_get(opts, "locking"),
589                               ON_OFF_AUTO_AUTO, &local_err);
590     if (local_err) {
591         error_propagate(errp, local_err);
592         ret = -EINVAL;
593         goto fail;
594     }
595     switch (locking) {
596     case ON_OFF_AUTO_ON:
597         s->use_lock = true;
598         if (!qemu_has_ofd_lock()) {
599             warn_report("File lock requested but OFD locking syscall is "
600                         "unavailable, falling back to POSIX file locks");
601             error_printf("Due to the implementation, locks can be lost "
602                          "unexpectedly.\n");
603         }
604         break;
605     case ON_OFF_AUTO_OFF:
606         s->use_lock = false;
607         break;
608     case ON_OFF_AUTO_AUTO:
609         s->use_lock = qemu_has_ofd_lock();
610         break;
611     default:
612         abort();
613     }
614 
615     str = qemu_opt_get(opts, "pr-manager");
616     if (str) {
617         s->pr_mgr = pr_manager_lookup(str, &local_err);
618         if (local_err) {
619             error_propagate(errp, local_err);
620             ret = -EINVAL;
621             goto fail;
622         }
623     }
624 
625     s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
626     s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
627                                                false);
628 
629     s->open_flags = open_flags;
630     raw_parse_flags(bdrv_flags, &s->open_flags, false);
631 
632     s->fd = -1;
633     fd = qemu_open(filename, s->open_flags, 0644);
634     ret = fd < 0 ? -errno : 0;
635 
636     if (ret < 0) {
637         error_setg_file_open(errp, -ret, filename);
638         if (ret == -EROFS) {
639             ret = -EACCES;
640         }
641         goto fail;
642     }
643     s->fd = fd;
644 
645     /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
646     if (s->open_flags & O_RDWR) {
647         ret = check_hdev_writable(s->fd);
648         if (ret < 0) {
649             error_setg_errno(errp, -ret, "The device is not writable");
650             goto fail;
651         }
652     }
653 
654     s->perm = 0;
655     s->shared_perm = BLK_PERM_ALL;
656 
657 #ifdef CONFIG_LINUX_AIO
658      /* Currently Linux does AIO only for files opened with O_DIRECT */
659     if (s->use_linux_aio) {
660         if (!(s->open_flags & O_DIRECT)) {
661             error_setg(errp, "aio=native was specified, but it requires "
662                              "cache.direct=on, which was not specified.");
663             ret = -EINVAL;
664             goto fail;
665         }
666         if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
667             error_prepend(errp, "Unable to use native AIO: ");
668             goto fail;
669         }
670     }
671 #else
672     if (s->use_linux_aio) {
673         error_setg(errp, "aio=native was specified, but is not supported "
674                          "in this build.");
675         ret = -EINVAL;
676         goto fail;
677     }
678 #endif /* !defined(CONFIG_LINUX_AIO) */
679 
680 #ifdef CONFIG_LINUX_IO_URING
681     if (s->use_linux_io_uring) {
682         if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
683             error_prepend(errp, "Unable to use io_uring: ");
684             goto fail;
685         }
686     }
687 #else
688     if (s->use_linux_io_uring) {
689         error_setg(errp, "aio=io_uring was specified, but is not supported "
690                          "in this build.");
691         ret = -EINVAL;
692         goto fail;
693     }
694 #endif /* !defined(CONFIG_LINUX_IO_URING) */
695 
696     s->has_discard = true;
697     s->has_write_zeroes = true;
698     if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
699         s->needs_alignment = true;
700     }
701 
702     if (fstat(s->fd, &st) < 0) {
703         ret = -errno;
704         error_setg_errno(errp, errno, "Could not stat file");
705         goto fail;
706     }
707 
708     if (!device) {
709         if (S_ISBLK(st.st_mode)) {
710             warn_report("Opening a block device as a file using the '%s' "
711                         "driver is deprecated", bs->drv->format_name);
712         } else if (S_ISCHR(st.st_mode)) {
713             warn_report("Opening a character device as a file using the '%s' "
714                         "driver is deprecated", bs->drv->format_name);
715         } else if (!S_ISREG(st.st_mode)) {
716             error_setg(errp, "A regular file was expected by the '%s' driver, "
717                        "but something else was given", bs->drv->format_name);
718             ret = -EINVAL;
719             goto fail;
720         } else {
721             s->discard_zeroes = true;
722             s->has_fallocate = true;
723         }
724     } else {
725         if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
726             error_setg(errp, "'%s' driver expects either "
727                        "a character or block device", bs->drv->format_name);
728             ret = -EINVAL;
729             goto fail;
730         }
731     }
732 
733     if (S_ISBLK(st.st_mode)) {
734 #ifdef BLKDISCARDZEROES
735         unsigned int arg;
736         if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
737             s->discard_zeroes = true;
738         }
739 #endif
740 #ifdef __linux__
741         /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
742          * not rely on the contents of discarded blocks unless using O_DIRECT.
743          * Same for BLKZEROOUT.
744          */
745         if (!(bs->open_flags & BDRV_O_NOCACHE)) {
746             s->discard_zeroes = false;
747             s->has_write_zeroes = false;
748         }
749 #endif
750     }
751 #ifdef __FreeBSD__
752     if (S_ISCHR(st.st_mode)) {
753         /*
754          * The file is a char device (disk), which on FreeBSD isn't behind
755          * a pager, so force all requests to be aligned. This is needed
756          * so QEMU makes sure all IO operations on the device are aligned
757          * to sector size, or else FreeBSD will reject them with EINVAL.
758          */
759         s->needs_alignment = true;
760     }
761 #endif
762 
763 #ifdef CONFIG_XFS
764     if (platform_test_xfs_fd(s->fd)) {
765         s->is_xfs = true;
766     }
767 #endif
768 
769     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
770     if (S_ISREG(st.st_mode)) {
771         /* When extending regular files, we get zeros from the OS */
772         bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
773     }
774     ret = 0;
775 fail:
776     if (ret < 0 && s->fd != -1) {
777         qemu_close(s->fd);
778     }
779     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
780         unlink(filename);
781     }
782     qemu_opts_del(opts);
783     return ret;
784 }
785 
786 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
787                     Error **errp)
788 {
789     BDRVRawState *s = bs->opaque;
790 
791     s->type = FTYPE_FILE;
792     return raw_open_common(bs, options, flags, 0, false, errp);
793 }
794 
/* Phase of a permission change, as handled by raw_handle_perm_lock(). */
795 typedef enum {
796     RAW_PL_PREPARE, /* take the additional locks and check for conflicts */
797     RAW_PL_COMMIT,  /* settle on the new permissions, unlocking unneeded bytes */
798     RAW_PL_ABORT,   /* go back to the locks matching the current permissions */
799 } RawPermLockOp;
800 
/*
 * Loop header that runs @i over the bit index of every permission bit that
 * fits in BLK_PERM_ALL. @i must be a modifiable integer lvalue; it is fully
 * parenthesized so that expressions such as *p expand correctly.
 */
#define PERM_FOREACH(i) \
    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; (i)++)
803 
804 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
805  * file; if @unlock == true, also unlock the unneeded bytes.
806  * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
807  */
808 static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
809                                 uint64_t perm_lock_bits,
810                                 uint64_t shared_perm_lock_bits,
811                                 bool unlock, Error **errp)
812 {
813     int ret;
814     int i;
815     uint64_t locked_perm, locked_shared_perm;
816 
817     if (s) {
818         locked_perm = s->locked_perm;
819         locked_shared_perm = s->locked_shared_perm;
820     } else {
821         /*
822          * We don't have the previous bits, just lock/unlock for each of the
823          * requested bits.
824          */
825         if (unlock) {
826             locked_perm = BLK_PERM_ALL;
827             locked_shared_perm = BLK_PERM_ALL;
828         } else {
829             locked_perm = 0;
830             locked_shared_perm = 0;
831         }
832     }
833 
834     PERM_FOREACH(i) {
835         int off = RAW_LOCK_PERM_BASE + i;
836         uint64_t bit = (1ULL << i);
837         if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
838             ret = qemu_lock_fd(fd, off, 1, false);
839             if (ret) {
840                 error_setg(errp, "Failed to lock byte %d", off);
841                 return ret;
842             } else if (s) {
843                 s->locked_perm |= bit;
844             }
845         } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
846             ret = qemu_unlock_fd(fd, off, 1);
847             if (ret) {
848                 error_setg(errp, "Failed to unlock byte %d", off);
849                 return ret;
850             } else if (s) {
851                 s->locked_perm &= ~bit;
852             }
853         }
854     }
855     PERM_FOREACH(i) {
856         int off = RAW_LOCK_SHARED_BASE + i;
857         uint64_t bit = (1ULL << i);
858         if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
859             ret = qemu_lock_fd(fd, off, 1, false);
860             if (ret) {
861                 error_setg(errp, "Failed to lock byte %d", off);
862                 return ret;
863             } else if (s) {
864                 s->locked_shared_perm |= bit;
865             }
866         } else if (unlock && (locked_shared_perm & bit) &&
867                    !(shared_perm_lock_bits & bit)) {
868             ret = qemu_unlock_fd(fd, off, 1);
869             if (ret) {
870                 error_setg(errp, "Failed to unlock byte %d", off);
871                 return ret;
872             } else if (s) {
873                 s->locked_shared_perm &= ~bit;
874             }
875         }
876     }
877     return 0;
878 }
879 
880 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
881 static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
882                                 Error **errp)
883 {
884     int ret;
885     int i;
886 
887     PERM_FOREACH(i) {
888         int off = RAW_LOCK_SHARED_BASE + i;
889         uint64_t p = 1ULL << i;
890         if (perm & p) {
891             ret = qemu_lock_fd_test(fd, off, 1, true);
892             if (ret) {
893                 char *perm_name = bdrv_perm_names(p);
894                 error_setg(errp,
895                            "Failed to get \"%s\" lock",
896                            perm_name);
897                 g_free(perm_name);
898                 return ret;
899             }
900         }
901     }
902     PERM_FOREACH(i) {
903         int off = RAW_LOCK_PERM_BASE + i;
904         uint64_t p = 1ULL << i;
905         if (!(shared_perm & p)) {
906             ret = qemu_lock_fd_test(fd, off, 1, true);
907             if (ret) {
908                 char *perm_name = bdrv_perm_names(p);
909                 error_setg(errp,
910                            "Failed to get shared \"%s\" lock",
911                            perm_name);
912                 g_free(perm_name);
913                 return ret;
914             }
915         }
916     }
917     return 0;
918 }
919 
920 static int raw_handle_perm_lock(BlockDriverState *bs,
921                                 RawPermLockOp op,
922                                 uint64_t new_perm, uint64_t new_shared,
923                                 Error **errp)
924 {
925     BDRVRawState *s = bs->opaque;
926     int ret = 0;
927     Error *local_err = NULL;
928 
929     if (!s->use_lock) {
930         return 0;
931     }
932 
933     if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
934         return 0;
935     }
936 
937     switch (op) {
938     case RAW_PL_PREPARE:
939         if ((s->perm | new_perm) == s->perm &&
940             (s->shared_perm & new_shared) == s->shared_perm)
941         {
942             /*
943              * We are going to unlock bytes, it should not fail. If it fail due
944              * to some fs-dependent permission-unrelated reasons (which occurs
945              * sometimes on NFS and leads to abort in bdrv_replace_child) we
946              * can't prevent such errors by any check here. And we ignore them
947              * anyway in ABORT and COMMIT.
948              */
949             return 0;
950         }
951         ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
952                                    ~s->shared_perm | ~new_shared,
953                                    false, errp);
954         if (!ret) {
955             ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
956             if (!ret) {
957                 return 0;
958             }
959             error_append_hint(errp,
960                               "Is another process using the image [%s]?\n",
961                               bs->filename);
962         }
963         /* fall through to unlock bytes. */
964     case RAW_PL_ABORT:
965         raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
966                              true, &local_err);
967         if (local_err) {
968             /* Theoretically the above call only unlocks bytes and it cannot
969              * fail. Something weird happened, report it.
970              */
971             warn_report_err(local_err);
972         }
973         break;
974     case RAW_PL_COMMIT:
975         raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
976                              true, &local_err);
977         if (local_err) {
978             /* Theoretically the above call only unlocks bytes and it cannot
979              * fail. Something weird happened, report it.
980              */
981             warn_report_err(local_err);
982         }
983         break;
984     }
985     return ret;
986 }
987 
988 static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
989                                  int *open_flags, uint64_t perm, bool force_dup,
990                                  Error **errp)
991 {
992     BDRVRawState *s = bs->opaque;
993     int fd = -1;
994     int ret;
995     bool has_writers = perm &
996         (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
997     int fcntl_flags = O_APPEND | O_NONBLOCK;
998 #ifdef O_NOATIME
999     fcntl_flags |= O_NOATIME;
1000 #endif
1001 
1002     *open_flags = 0;
1003     if (s->type == FTYPE_CD) {
1004         *open_flags |= O_NONBLOCK;
1005     }
1006 
1007     raw_parse_flags(flags, open_flags, has_writers);
1008 
1009 #ifdef O_ASYNC
1010     /* Not all operating systems have O_ASYNC, and those that don't
1011      * will not let us track the state into rs->open_flags (typically
1012      * you achieve the same effect with an ioctl, for example I_SETSIG
1013      * on Solaris). But we do not use O_ASYNC, so that's fine.
1014      */
1015     assert((s->open_flags & O_ASYNC) == 0);
1016 #endif
1017 
1018     if (!force_dup && *open_flags == s->open_flags) {
1019         /* We're lucky, the existing fd is fine */
1020         return s->fd;
1021     }
1022 
1023     if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
1024         /* dup the original fd */
1025         fd = qemu_dup(s->fd);
1026         if (fd >= 0) {
1027             ret = fcntl_setfl(fd, *open_flags);
1028             if (ret) {
1029                 qemu_close(fd);
1030                 fd = -1;
1031             }
1032         }
1033     }
1034 
1035     /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
1036     if (fd == -1) {
1037         const char *normalized_filename = bs->filename;
1038         ret = raw_normalize_devicepath(&normalized_filename, errp);
1039         if (ret >= 0) {
1040             assert(!(*open_flags & O_CREAT));
1041             fd = qemu_open(normalized_filename, *open_flags);
1042             if (fd == -1) {
1043                 error_setg_errno(errp, errno, "Could not reopen file");
1044                 return -1;
1045             }
1046         }
1047     }
1048 
1049     if (fd != -1 && (*open_flags & O_RDWR)) {
1050         ret = check_hdev_writable(fd);
1051         if (ret < 0) {
1052             qemu_close(fd);
1053             error_setg_errno(errp, -ret, "The device is not writable");
1054             return -1;
1055         }
1056     }
1057 
1058     return fd;
1059 }
1060 
1061 static int raw_reopen_prepare(BDRVReopenState *state,
1062                               BlockReopenQueue *queue, Error **errp)
1063 {
1064     BDRVRawState *s;
1065     BDRVRawReopenState *rs;
1066     QemuOpts *opts;
1067     int ret;
1068     Error *local_err = NULL;
1069 
1070     assert(state != NULL);
1071     assert(state->bs != NULL);
1072 
1073     s = state->bs->opaque;
1074 
1075     state->opaque = g_new0(BDRVRawReopenState, 1);
1076     rs = state->opaque;
1077 
1078     /* Handle options changes */
1079     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
1080     if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
1081         ret = -EINVAL;
1082         goto out;
1083     }
1084 
1085     rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
1086     rs->check_cache_dropped =
1087         qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
1088 
1089     /* This driver's reopen function doesn't currently allow changing
1090      * other options, so let's put them back in the original QDict and
1091      * bdrv_reopen_prepare() will detect changes and complain. */
1092     qemu_opts_to_qdict(opts, state->options);
1093 
1094     rs->fd = raw_reconfigure_getfd(state->bs, state->flags, &rs->open_flags,
1095                                    state->perm, true, &local_err);
1096     if (local_err) {
1097         error_propagate(errp, local_err);
1098         ret = -1;
1099         goto out;
1100     }
1101 
1102     /* Fail already reopen_prepare() if we can't get a working O_DIRECT
1103      * alignment with the new fd. */
1104     if (rs->fd != -1) {
1105         raw_probe_alignment(state->bs, rs->fd, &local_err);
1106         if (local_err) {
1107             error_propagate(errp, local_err);
1108             ret = -EINVAL;
1109             goto out_fd;
1110         }
1111     }
1112 
1113     s->reopen_state = state;
1114     ret = 0;
1115 out_fd:
1116     if (ret < 0) {
1117         qemu_close(rs->fd);
1118         rs->fd = -1;
1119     }
1120 out:
1121     qemu_opts_del(opts);
1122     return ret;
1123 }
1124 
1125 static void raw_reopen_commit(BDRVReopenState *state)
1126 {
1127     BDRVRawReopenState *rs = state->opaque;
1128     BDRVRawState *s = state->bs->opaque;
1129 
1130     s->drop_cache = rs->drop_cache;
1131     s->check_cache_dropped = rs->check_cache_dropped;
1132     s->open_flags = rs->open_flags;
1133 
1134     qemu_close(s->fd);
1135     s->fd = rs->fd;
1136 
1137     g_free(state->opaque);
1138     state->opaque = NULL;
1139 
1140     assert(s->reopen_state == state);
1141     s->reopen_state = NULL;
1142 }
1143 
1144 
1145 static void raw_reopen_abort(BDRVReopenState *state)
1146 {
1147     BDRVRawReopenState *rs = state->opaque;
1148     BDRVRawState *s = state->bs->opaque;
1149 
1150      /* nothing to do if NULL, we didn't get far enough */
1151     if (rs == NULL) {
1152         return;
1153     }
1154 
1155     if (rs->fd >= 0) {
1156         qemu_close(rs->fd);
1157         rs->fd = -1;
1158     }
1159     g_free(state->opaque);
1160     state->opaque = NULL;
1161 
1162     assert(s->reopen_state == state);
1163     s->reopen_state = NULL;
1164 }
1165 
/*
 * Query the BLKSECTGET ioctl on @fd.
 *
 * Returns the value reported by the ioctl, -errno if the ioctl fails, or
 * -ENOSYS when BLKSECTGET is not available at build time.
 */
static int sg_get_max_transfer_length(int fd)
{
#ifdef BLKSECTGET
    int max_bytes = 0;

    if (ioctl(fd, BLKSECTGET, &max_bytes) != 0) {
        return -errno;
    }
    return max_bytes;
#else
    return -ENOSYS;
#endif
}
1180 
/*
 * Read the "max_segments" queue attribute of the device behind @fd from
 * sysfs (/sys/dev/block/<major>:<minor>/queue/max_segments).
 *
 * Returns the parsed value when the attribute consists of a number followed
 * by a newline, a negative errno value on failure, or -ENOTSUP when not
 * built with CONFIG_LINUX.
 */
static int sg_get_max_segments(int fd)
{
#ifdef CONFIG_LINUX
    struct stat st;
    char text[32];
    const char *tail;
    long parsed;
    char *attr_path = NULL;
    int attr_fd = -1;
    int rc;

    if (fstat(fd, &st) != 0) {
        rc = -errno;
        goto done;
    }

    attr_path = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                                major(st.st_rdev), minor(st.st_rdev));
    attr_fd = open(attr_path, O_RDONLY);
    if (attr_fd == -1) {
        rc = -errno;
        goto done;
    }

    /* Read the attribute, restarting when interrupted by a signal. */
    for (;;) {
        rc = read(attr_fd, text, sizeof(text) - 1);
        if (rc != -1 || errno != EINTR) {
            break;
        }
    }
    if (rc < 0) {
        rc = -errno;
        goto done;
    }
    if (rc == 0) {
        rc = -EIO;
        goto done;
    }
    text[rc] = 0;

    /* The attribute ends with '\n'; passing 'tail' lets us accept that. */
    rc = qemu_strtol(text, &tail, 10, &parsed);
    if (rc == 0 && tail && *tail == '\n') {
        rc = parsed;
    }

done:
    if (attr_fd != -1) {
        close(attr_fd);
    }
    g_free(attr_path);
    return rc;
#else
    return -ENOTSUP;
#endif
}
1231 
1232 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1233 {
1234     BDRVRawState *s = bs->opaque;
1235 
1236     if (bs->sg) {
1237         int ret = sg_get_max_transfer_length(s->fd);
1238 
1239         if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1240             bs->bl.max_transfer = pow2floor(ret);
1241         }
1242 
1243         ret = sg_get_max_segments(s->fd);
1244         if (ret > 0) {
1245             bs->bl.max_transfer = MIN(bs->bl.max_transfer,
1246                                       ret * qemu_real_host_page_size);
1247         }
1248     }
1249 
1250     raw_probe_alignment(bs, s->fd, errp);
1251     bs->bl.min_mem_alignment = s->buf_align;
1252     bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
1253 }
1254 
/*
 * Issue the BIODASDINFO2 ioctl on @fd.
 *
 * Returns the ioctl's result, or -1 when BIODASDINFO2 is not available at
 * build time.
 */
static int check_for_dasd(int fd)
{
    int rc = -1;
#ifdef BIODASDINFO2
    struct dasd_information2_t info = {0};

    rc = ioctl(fd, BIODASDINFO2, &info);
#endif
    return rc;
}
1265 
1266 /**
1267  * Try to get @bs's logical and physical block size.
1268  * On success, store them in @bsz and return zero.
1269  * On failure, return negative errno.
1270  */
1271 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1272 {
1273     BDRVRawState *s = bs->opaque;
1274     int ret;
1275 
1276     /* If DASD, get blocksizes */
1277     if (check_for_dasd(s->fd) < 0) {
1278         return -ENOTSUP;
1279     }
1280     ret = probe_logical_blocksize(s->fd, &bsz->log);
1281     if (ret < 0) {
1282         return ret;
1283     }
1284     return probe_physical_blocksize(s->fd, &bsz->phys);
1285 }
1286 
1287 /**
1288  * Try to get @bs's geometry: cyls, heads, sectors.
1289  * On success, store them in @geo and return 0.
1290  * On failure return -errno.
1291  * (Allows block driver to assign default geometry values that guest sees)
1292  */
1293 #ifdef __linux__
1294 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1295 {
1296     BDRVRawState *s = bs->opaque;
1297     struct hd_geometry ioctl_geo = {0};
1298 
1299     /* If DASD, get its geometry */
1300     if (check_for_dasd(s->fd) < 0) {
1301         return -ENOTSUP;
1302     }
1303     if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1304         return -errno;
1305     }
1306     /* HDIO_GETGEO may return success even though geo contains zeros
1307        (e.g. certain multipath setups) */
1308     if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1309         return -ENOTSUP;
1310     }
1311     /* Do not return a geometry for partition */
1312     if (ioctl_geo.start != 0) {
1313         return -ENOTSUP;
1314     }
1315     geo->heads = ioctl_geo.heads;
1316     geo->sectors = ioctl_geo.sectors;
1317     geo->cylinders = ioctl_geo.cylinders;
1318 
1319     return 0;
1320 }
1321 #else /* __linux__ */
1322 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1323 {
1324     return -ENOTSUP;
1325 }
1326 #endif
1327 
1328 #if defined(__linux__)
1329 static int handle_aiocb_ioctl(void *opaque)
1330 {
1331     RawPosixAIOData *aiocb = opaque;
1332     int ret;
1333 
1334     ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
1335     if (ret == -1) {
1336         return -errno;
1337     }
1338 
1339     return 0;
1340 }
1341 #endif /* linux */
1342 
1343 static int handle_aiocb_flush(void *opaque)
1344 {
1345     RawPosixAIOData *aiocb = opaque;
1346     BDRVRawState *s = aiocb->bs->opaque;
1347     int ret;
1348 
1349     if (s->page_cache_inconsistent) {
1350         return -EIO;
1351     }
1352 
1353     ret = qemu_fdatasync(aiocb->aio_fildes);
1354     if (ret == -1) {
1355         /* There is no clear definition of the semantics of a failing fsync(),
1356          * so we may have to assume the worst. The sad truth is that this
1357          * assumption is correct for Linux. Some pages are now probably marked
1358          * clean in the page cache even though they are inconsistent with the
1359          * on-disk contents. The next fdatasync() call would succeed, but no
1360          * further writeback attempt will be made. We can't get back to a state
1361          * in which we know what is on disk (we would have to rewrite
1362          * everything that was touched since the last fdatasync() at least), so
1363          * make bdrv_flush() fail permanently. Given that the behaviour isn't
1364          * really defined, I have little hope that other OSes are doing better.
1365          *
1366          * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1367          * cache. */
1368         if ((s->open_flags & O_DIRECT) == 0) {
1369             s->page_cache_inconsistent = true;
1370         }
1371         return -errno;
1372     }
1373     return 0;
1374 }
1375 
/*
 * Vectored positional I/O wrappers.
 *
 * With CONFIG_PREADV they forward to preadv()/pwritev() and preadv_present
 * starts out true.  Without it they are stubs returning -ENOSYS and
 * preadv_present is false from the start.
 */
#ifdef CONFIG_PREADV

/* Whether the vectored path should (still) be tried. */
static bool preadv_present = true;

/* Thin wrapper around preadv(); returns whatever preadv() returns. */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return preadv(fd, iov, nr_iov, offset);
}

/* Thin wrapper around pwritev(); returns whatever pwritev() returns. */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return pwritev(fd, iov, nr_iov, offset);
}

#else

/* No preadv()/pwritev() available in this build. */
static bool preadv_present = false;

/* Stub: always returns -ENOSYS (as a value, errno is not touched). */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

/* Stub: always returns -ENOSYS (as a value, errno is not touched). */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

#endif
1409 
1410 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1411 {
1412     ssize_t len;
1413 
1414     do {
1415         if (aiocb->aio_type & QEMU_AIO_WRITE)
1416             len = qemu_pwritev(aiocb->aio_fildes,
1417                                aiocb->io.iov,
1418                                aiocb->io.niov,
1419                                aiocb->aio_offset);
1420          else
1421             len = qemu_preadv(aiocb->aio_fildes,
1422                               aiocb->io.iov,
1423                               aiocb->io.niov,
1424                               aiocb->aio_offset);
1425     } while (len == -1 && errno == EINTR);
1426 
1427     if (len == -1) {
1428         return -errno;
1429     }
1430     return len;
1431 }
1432 
1433 /*
1434  * Read/writes the data to/from a given linear buffer.
1435  *
1436  * Returns the number of bytes handles or -errno in case of an error. Short
1437  * reads are only returned if the end of the file is reached.
1438  */
1439 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1440 {
1441     ssize_t offset = 0;
1442     ssize_t len;
1443 
1444     while (offset < aiocb->aio_nbytes) {
1445         if (aiocb->aio_type & QEMU_AIO_WRITE) {
1446             len = pwrite(aiocb->aio_fildes,
1447                          (const char *)buf + offset,
1448                          aiocb->aio_nbytes - offset,
1449                          aiocb->aio_offset + offset);
1450         } else {
1451             len = pread(aiocb->aio_fildes,
1452                         buf + offset,
1453                         aiocb->aio_nbytes - offset,
1454                         aiocb->aio_offset + offset);
1455         }
1456         if (len == -1 && errno == EINTR) {
1457             continue;
1458         } else if (len == -1 && errno == EINVAL &&
1459                    (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1460                    !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1461                    offset > 0) {
1462             /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1463              * after a short read.  Assume that O_DIRECT short reads only occur
1464              * at EOF.  Therefore this is a short read, not an I/O error.
1465              */
1466             break;
1467         } else if (len == -1) {
1468             offset = -errno;
1469             break;
1470         } else if (len == 0) {
1471             break;
1472         }
1473         offset += len;
1474     }
1475 
1476     return offset;
1477 }
1478 
/*
 * Worker for read/write requests described by @opaque (a RawPosixAIOData).
 *
 * Strategy:
 *  - request not flagged QEMU_AIO_MISALIGNED and a single iovec: transfer
 *    directly from/to that buffer;
 *  - not flagged and several iovecs: try one vectored call while
 *    preadv_present is set;
 *  - otherwise (or when the vectored call was short or returned -ENOSYS):
 *    go through a single bounce buffer from qemu_try_blockalign().
 *
 * Returns 0 on success or a negative errno value.  A short read is not an
 * error: the unread tail of the iovec is zero-filled and 0 is returned.
 * A short write is reported as -EINVAL.
 */
static int handle_aiocb_rw(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    ssize_t nbytes;
    char *buf;

    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
        /*
         * If there is just a single buffer, and it is properly aligned
         * we can just use plain pread/pwrite without any problems.
         */
        if (aiocb->io.niov == 1) {
            nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
            goto out;
        }
        /*
         * We have more than one iovec, and all are properly aligned.
         *
         * Try preadv/pwritev first and fall back to linearizing the
         * buffer if it's not supported.
         */
        if (preadv_present) {
            nbytes = handle_aiocb_rw_vector(aiocb);
            /* Done on a complete transfer or on any error but -ENOSYS. */
            if (nbytes == aiocb->aio_nbytes ||
                (nbytes < 0 && nbytes != -ENOSYS)) {
                goto out;
            }
            /*
             * Reached for -ENOSYS and also for a short transfer; either
             * way the vectored path is not tried again afterwards.
             */
            preadv_present = false;
        }

        /*
         * XXX(hch): short read/write.  There is no easy way to handle the
         * remainder using these interfaces.  For now retry using plain
         * pread/pwrite?
         */
    }

    /*
     * Ok, we have to do it the hard way, copy all segments into
     * a single aligned buffer.
     */
    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
    if (buf == NULL) {
        nbytes = -ENOMEM;
        goto out;
    }

    /* Writes: gather the iovec contents into the bounce buffer first. */
    if (aiocb->aio_type & QEMU_AIO_WRITE) {
        char *p = buf;
        int i;

        for (i = 0; i < aiocb->io.niov; ++i) {
            memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
            p += aiocb->io.iov[i].iov_len;
        }
        assert(p - buf == aiocb->aio_nbytes);
    }

    nbytes = handle_aiocb_rw_linear(aiocb, buf);
    /*
     * Reads: scatter the bounce buffer back into the iovec.  The full
     * request length is copied regardless of nbytes; a short read gets its
     * tail zeroed at "out" below.
     */
    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
        char *p = buf;
        size_t count = aiocb->aio_nbytes, copy;
        int i;

        for (i = 0; i < aiocb->io.niov && count; ++i) {
            copy = count;
            if (copy > aiocb->io.iov[i].iov_len) {
                copy = aiocb->io.iov[i].iov_len;
            }
            memcpy(aiocb->io.iov[i].iov_base, p, copy);
            assert(count >= copy);
            p     += copy;
            count -= copy;
        }
        assert(count == 0);
    }
    qemu_vfree(buf);

out:
    /* Translate the byte count into the 0 / -errno result. */
    if (nbytes == aiocb->aio_nbytes) {
        return 0;
    } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
        if (aiocb->aio_type & QEMU_AIO_WRITE) {
            return -EINVAL;
        } else {
            /* Short read: the part that was not read is made to read as 0. */
            iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
                      0, aiocb->aio_nbytes - nbytes);
            return 0;
        }
    } else {
        assert(nbytes < 0);
        return nbytes;
    }
}
1573 
/*
 * Normalize the negative errno values that signal a missing feature
 * (-ENODEV, -ENOSYS, -EOPNOTSUPP, -ENOTTY) to -ENOTSUP.  Any other value is
 * returned unchanged.
 */
static int translate_err(int err)
{
    const int unsupported = (err == -ENODEV) ||
                            (err == -ENOSYS) ||
                            (err == -EOPNOTSUPP) ||
                            (err == -ENOTTY);

    return unsupported ? -ENOTSUP : err;
}
1582 
#ifdef CONFIG_FALLOCATE
/*
 * Call fallocate(), repeating it while it fails with EINTR.
 *
 * Returns 0 on success, otherwise the negative errno value after it has
 * been passed through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    int rc;

    do {
        rc = fallocate(fd, mode, offset, len);
    } while (rc != 0 && errno == EINTR);

    return rc == 0 ? 0 : translate_err(-errno);
}
#endif
1594 
1595 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1596 {
1597     int ret = -ENOTSUP;
1598     BDRVRawState *s = aiocb->bs->opaque;
1599 
1600     if (!s->has_write_zeroes) {
1601         return -ENOTSUP;
1602     }
1603 
1604 #ifdef BLKZEROOUT
1605     /* The BLKZEROOUT implementation in the kernel doesn't set
1606      * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1607      * fallbacks. */
1608     if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1609         do {
1610             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1611             if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1612                 return 0;
1613             }
1614         } while (errno == EINTR);
1615 
1616         ret = translate_err(-errno);
1617         if (ret == -ENOTSUP) {
1618             s->has_write_zeroes = false;
1619         }
1620     }
1621 #endif
1622 
1623     return ret;
1624 }
1625 
/*
 * Worker for write-zeroes requests described by @opaque (a
 * RawPosixAIOData).
 *
 * Requests flagged QEMU_AIO_BLKDEV are handed to
 * handle_aiocb_write_zeroes_block().  Otherwise, depending on what was
 * enabled at build time, these are tried in order:
 *  1. fallocate(FALLOC_FL_ZERO_RANGE);
 *  2. punching a hole and then allocating the range again;
 *  3. plain fallocate() when the range starts at or beyond the current
 *     length.
 * A strategy that reports -ENOTSUP clears the matching capability flag in
 * the driver state before the next one is considered.
 *
 * Returns 0 on success, -ENOTSUP when no strategy is available, or another
 * negative errno value.
 */
static int handle_aiocb_write_zeroes(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
#ifdef CONFIG_FALLOCATE
    BDRVRawState *s = aiocb->bs->opaque;
    int64_t len;
#endif

    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
        return handle_aiocb_write_zeroes_block(aiocb);
    }

#ifdef CONFIG_FALLOCATE_ZERO_RANGE
    if (s->has_write_zeroes) {
        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
                               aiocb->aio_offset, aiocb->aio_nbytes);
        if (ret == -EINVAL) {
            /*
             * Allow falling back to pwrite for file systems that
             * do not support fallocate() for an unaligned byte range.
             */
            return -ENOTSUP;
        }
        /* Success or a real error ends here; only -ENOTSUP goes on. */
        if (ret == 0 || ret != -ENOTSUP) {
            return ret;
        }
        s->has_write_zeroes = false;
    }
#endif

#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
    if (s->has_discard && s->has_fallocate) {
        /* Deallocate the range first, then allocate it again. */
        int ret = do_fallocate(s->fd,
                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                               aiocb->aio_offset, aiocb->aio_nbytes);
        if (ret == 0) {
            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
            if (ret == 0 || ret != -ENOTSUP) {
                return ret;
            }
            /* Plain fallocate() is unsupported. */
            s->has_fallocate = false;
        } else if (ret != -ENOTSUP) {
            return ret;
        } else {
            /* Hole punching is unsupported. */
            s->has_discard = false;
        }
    }
#endif

#ifdef CONFIG_FALLOCATE
    /* Last resort: we are trying to extend the file with zeroed data. This
     * can be done via fallocate(fd, 0) */
    len = bdrv_getlength(aiocb->bs);
    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
        if (ret == 0 || ret != -ENOTSUP) {
            return ret;
        }
        s->has_fallocate = false;
    }
#endif

    return -ENOTSUP;
}
1690 
1691 static int handle_aiocb_write_zeroes_unmap(void *opaque)
1692 {
1693     RawPosixAIOData *aiocb = opaque;
1694     BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1695 
1696     /* First try to write zeros and unmap at the same time */
1697 
1698 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1699     int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1700                            aiocb->aio_offset, aiocb->aio_nbytes);
1701     switch (ret) {
1702     case -ENOTSUP:
1703     case -EINVAL:
1704         break;
1705     default:
1706         return ret;
1707     }
1708 #endif
1709 
1710     /* If we couldn't manage to unmap while guaranteed that the area reads as
1711      * all-zero afterwards, just write zeroes without unmapping */
1712     return handle_aiocb_write_zeroes(aiocb);
1713 }
1714 
#ifndef HAVE_COPY_FILE_RANGE
/*
 * Fallback definition of copy_file_range() for builds where
 * HAVE_COPY_FILE_RANGE is not defined.
 *
 * If the syscall number is known (__NR_copy_file_range), the system call is
 * invoked directly and its result returned.  Otherwise errno is set to
 * ENOSYS and -1 is returned.
 */
static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                             off_t *out_off, size_t len, unsigned int flags)
{
#ifdef __NR_copy_file_range
    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                   out_off, len, flags);
#else
    errno = ENOSYS;
    return -1;
#endif
}
#endif
1728 
1729 static int handle_aiocb_copy_range(void *opaque)
1730 {
1731     RawPosixAIOData *aiocb = opaque;
1732     uint64_t bytes = aiocb->aio_nbytes;
1733     off_t in_off = aiocb->aio_offset;
1734     off_t out_off = aiocb->copy_range.aio_offset2;
1735 
1736     while (bytes) {
1737         ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1738                                       aiocb->copy_range.aio_fd2, &out_off,
1739                                       bytes, 0);
1740         trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1741                                    aiocb->copy_range.aio_fd2, out_off, bytes,
1742                                    0, ret);
1743         if (ret == 0) {
1744             /* No progress (e.g. when beyond EOF), let the caller fall back to
1745              * buffer I/O. */
1746             return -ENOSPC;
1747         }
1748         if (ret < 0) {
1749             switch (errno) {
1750             case ENOSYS:
1751                 return -ENOTSUP;
1752             case EINTR:
1753                 continue;
1754             default:
1755                 return -errno;
1756             }
1757         }
1758         bytes -= ret;
1759     }
1760     return 0;
1761 }
1762 
1763 static int handle_aiocb_discard(void *opaque)
1764 {
1765     RawPosixAIOData *aiocb = opaque;
1766     int ret = -EOPNOTSUPP;
1767     BDRVRawState *s = aiocb->bs->opaque;
1768 
1769     if (!s->has_discard) {
1770         return -ENOTSUP;
1771     }
1772 
1773     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1774 #ifdef BLKDISCARD
1775         do {
1776             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1777             if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1778                 return 0;
1779             }
1780         } while (errno == EINTR);
1781 
1782         ret = -errno;
1783 #endif
1784     } else {
1785 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1786         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1787                            aiocb->aio_offset, aiocb->aio_nbytes);
1788 #endif
1789     }
1790 
1791     ret = translate_err(ret);
1792     if (ret == -ENOTSUP) {
1793         s->has_discard = false;
1794     }
1795     return ret;
1796 }
1797 
1798 /*
1799  * Help alignment probing by allocating the first block.
1800  *
1801  * When reading with direct I/O from unallocated area on Gluster backed by XFS,
1802  * reading succeeds regardless of request length. In this case we fallback to
1803  * safe alignment which is not optimal. Allocating the first block avoids this
1804  * fallback.
1805  *
1806  * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
1807  * request alignment, so we use safe values.
1808  *
1809  * Returns: 0 on success, -errno on failure. Since this is an optimization,
1810  * caller may ignore failures.
1811  */
1812 static int allocate_first_block(int fd, size_t max_size)
1813 {
1814     size_t write_size = (max_size < MAX_BLOCKSIZE)
1815         ? BDRV_SECTOR_SIZE
1816         : MAX_BLOCKSIZE;
1817     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
1818     void *buf;
1819     ssize_t n;
1820     int ret;
1821 
1822     buf = qemu_memalign(max_align, write_size);
1823     memset(buf, 0, write_size);
1824 
1825     do {
1826         n = pwrite(fd, buf, write_size, 0);
1827     } while (n == -1 && errno == EINTR);
1828 
1829     ret = (n == -1) ? -errno : 0;
1830 
1831     qemu_vfree(buf);
1832     return ret;
1833 }
1834 
/*
 * Worker for truncate requests described by @opaque (a RawPosixAIOData):
 * resizes the file behind aio_fildes to aio_offset bytes using the
 * requested preallocation mode.
 *
 * - PREALLOC_MODE_OFF: plain ftruncate().
 * - PREALLOC_MODE_FALLOC (only with CONFIG_POSIX_FALLOCATE):
 *   posix_fallocate() for the newly added range.
 * - PREALLOC_MODE_FULL: ftruncate() to the new size, then the added range
 *   is filled with zeroes by write() and flushed with fsync().
 * - anything else: rejected with -ENOTSUP.
 *
 * Preallocation cannot be combined with shrinking.  If a preallocating mode
 * fails, an attempt is made to restore the previous file length.
 *
 * Returns 0 on success or a negative errno value; errors are additionally
 * described in the Error object referenced by the request.
 */
static int handle_aiocb_truncate(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    int result = 0;
    int64_t current_length = 0;
    char *buf = NULL;
    struct stat st;
    int fd = aiocb->aio_fildes;
    int64_t offset = aiocb->aio_offset;
    PreallocMode prealloc = aiocb->truncate.prealloc;
    Error **errp = aiocb->truncate.errp;

    if (fstat(fd, &st) < 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not stat file");
        return result;
    }

    current_length = st.st_size;
    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Cannot use preallocation for shrinking files");
        return -ENOTSUP;
    }

    switch (prealloc) {
#ifdef CONFIG_POSIX_FALLOCATE
    case PREALLOC_MODE_FALLOC:
        /*
         * Truncating before posix_fallocate() makes it about twice slower on
         * file systems that do not support fallocate(), trying to check if a
         * block is allocated before allocating it, so don't do that here.
         */
        if (offset != current_length) {
            /* posix_fallocate() returns a positive error number. */
            result = -posix_fallocate(fd, current_length,
                                      offset - current_length);
            if (result != 0) {
                /* posix_fallocate() doesn't set errno. */
                error_setg_errno(errp, -result,
                                 "Could not preallocate new data");
            } else if (current_length == 0) {
                /*
                 * posix_fallocate() uses fallocate() if the filesystem
                 * supports it, or fallback to manually writing zeroes. If
                 * fallocate() was used, unaligned reads from the fallocated
                 * area in raw_probe_alignment() will succeed, hence we need to
                 * allocate the first block.
                 *
                 * Optimize future alignment probing; ignore failures.
                 */
                allocate_first_block(fd, offset);
            }
        } else {
            result = 0;
        }
        goto out;
#endif
    case PREALLOC_MODE_FULL:
    {
        /* 'left' counts the bytes that still have to be written as zeroes. */
        int64_t num = 0, left = offset - current_length;
        off_t seek_result;

        /*
         * Knowing the final size from the beginning could allow the file
         * system driver to do less allocations and possibly avoid
         * fragmentation of the file.
         */
        if (ftruncate(fd, offset) != 0) {
            result = -errno;
            error_setg_errno(errp, -result, "Could not resize file");
            goto out;
        }

        /* Zero-filled chunk that is written repeatedly. */
        buf = g_malloc0(65536);

        seek_result = lseek(fd, current_length, SEEK_SET);
        if (seek_result < 0) {
            result = -errno;
            error_setg_errno(errp, -result,
                             "Failed to seek to the old end of file");
            goto out;
        }

        while (left > 0) {
            num = MIN(left, 65536);
            result = write(fd, buf, num);
            if (result < 0) {
                /* Interrupted writes are simply retried. */
                if (errno == EINTR) {
                    continue;
                }
                result = -errno;
                error_setg_errno(errp, -result,
                                 "Could not write zeros for preallocation");
                goto out;
            }
            left -= result;
        }
        if (result >= 0) {
            result = fsync(fd);
            if (result < 0) {
                result = -errno;
                error_setg_errno(errp, -result,
                                 "Could not flush file to disk");
                goto out;
            }
        }
        goto out;
    }
    case PREALLOC_MODE_OFF:
        if (ftruncate(fd, offset) != 0) {
            result = -errno;
            error_setg_errno(errp, -result, "Could not resize file");
        } else if (current_length == 0 && offset > current_length) {
            /* Optimize future alignment probing; ignore failures. */
            allocate_first_block(fd, offset);
        }
        /* Nothing was allocated and there is nothing to roll back. */
        return result;
    default:
        result = -ENOTSUP;
        error_setg(errp, "Unsupported preallocation mode: %s",
                   PreallocMode_str(prealloc));
        return result;
    }

out:
    /* A failed preallocation must not leave the file at the new size. */
    if (result < 0) {
        if (ftruncate(fd, current_length) < 0) {
            error_report("Failed to restore old file length: %s",
                         strerror(errno));
        }
    }

    g_free(buf);
    return result;
}
1969 
1970 static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
1971                                                ThreadPoolFunc func, void *arg)
1972 {
1973     /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
1974     ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1975     return thread_pool_submit_co(pool, func, arg);
1976 }
1977 
1978 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1979                                    uint64_t bytes, QEMUIOVector *qiov, int type)
1980 {
1981     BDRVRawState *s = bs->opaque;
1982     RawPosixAIOData acb;
1983 
1984     if (fd_open(bs) < 0)
1985         return -EIO;
1986 
1987     /*
1988      * When using O_DIRECT, the request must be aligned to be able to use
1989      * either libaio or io_uring interface. If not fail back to regular thread
1990      * pool read/write code which emulates this for us if we
1991      * set QEMU_AIO_MISALIGNED.
1992      */
1993     if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
1994         type |= QEMU_AIO_MISALIGNED;
1995 #ifdef CONFIG_LINUX_IO_URING
1996     } else if (s->use_linux_io_uring) {
1997         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
1998         assert(qiov->size == bytes);
1999         return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
2000 #endif
2001 #ifdef CONFIG_LINUX_AIO
2002     } else if (s->use_linux_aio) {
2003         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2004         assert(qiov->size == bytes);
2005         return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
2006 #endif
2007     }
2008 
2009     acb = (RawPosixAIOData) {
2010         .bs             = bs,
2011         .aio_fildes     = s->fd,
2012         .aio_type       = type,
2013         .aio_offset     = offset,
2014         .aio_nbytes     = bytes,
2015         .io             = {
2016             .iov            = qiov->iov,
2017             .niov           = qiov->niov,
2018         },
2019     };
2020 
2021     assert(qiov->size == bytes);
2022     return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
2023 }
2024 
2025 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
2026                                       uint64_t bytes, QEMUIOVector *qiov,
2027                                       int flags)
2028 {
2029     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
2030 }
2031 
2032 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
2033                                        uint64_t bytes, QEMUIOVector *qiov,
2034                                        int flags)
2035 {
2036     assert(flags == 0);
2037     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
2038 }
2039 
2040 static void raw_aio_plug(BlockDriverState *bs)
2041 {
2042     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2043 #ifdef CONFIG_LINUX_AIO
2044     if (s->use_linux_aio) {
2045         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2046         laio_io_plug(bs, aio);
2047     }
2048 #endif
2049 #ifdef CONFIG_LINUX_IO_URING
2050     if (s->use_linux_io_uring) {
2051         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2052         luring_io_plug(bs, aio);
2053     }
2054 #endif
2055 }
2056 
2057 static void raw_aio_unplug(BlockDriverState *bs)
2058 {
2059     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2060 #ifdef CONFIG_LINUX_AIO
2061     if (s->use_linux_aio) {
2062         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2063         laio_io_unplug(bs, aio);
2064     }
2065 #endif
2066 #ifdef CONFIG_LINUX_IO_URING
2067     if (s->use_linux_io_uring) {
2068         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2069         luring_io_unplug(bs, aio);
2070     }
2071 #endif
2072 }
2073 
2074 static int raw_co_flush_to_disk(BlockDriverState *bs)
2075 {
2076     BDRVRawState *s = bs->opaque;
2077     RawPosixAIOData acb;
2078     int ret;
2079 
2080     ret = fd_open(bs);
2081     if (ret < 0) {
2082         return ret;
2083     }
2084 
2085     acb = (RawPosixAIOData) {
2086         .bs             = bs,
2087         .aio_fildes     = s->fd,
2088         .aio_type       = QEMU_AIO_FLUSH,
2089     };
2090 
2091 #ifdef CONFIG_LINUX_IO_URING
2092     if (s->use_linux_io_uring) {
2093         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2094         return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
2095     }
2096 #endif
2097     return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
2098 }
2099 
2100 static void raw_aio_attach_aio_context(BlockDriverState *bs,
2101                                        AioContext *new_context)
2102 {
2103     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2104 #ifdef CONFIG_LINUX_AIO
2105     if (s->use_linux_aio) {
2106         Error *local_err = NULL;
2107         if (!aio_setup_linux_aio(new_context, &local_err)) {
2108             error_reportf_err(local_err, "Unable to use native AIO, "
2109                                          "falling back to thread pool: ");
2110             s->use_linux_aio = false;
2111         }
2112     }
2113 #endif
2114 #ifdef CONFIG_LINUX_IO_URING
2115     if (s->use_linux_io_uring) {
2116         Error *local_err;
2117         if (!aio_setup_linux_io_uring(new_context, &local_err)) {
2118             error_reportf_err(local_err, "Unable to use linux io_uring, "
2119                                          "falling back to thread pool: ");
2120             s->use_linux_io_uring = false;
2121         }
2122     }
2123 #endif
2124 }
2125 
2126 static void raw_close(BlockDriverState *bs)
2127 {
2128     BDRVRawState *s = bs->opaque;
2129 
2130     if (s->fd >= 0) {
2131         qemu_close(s->fd);
2132         s->fd = -1;
2133     }
2134 }
2135 
2136 /**
2137  * Truncates the given regular file @fd to @offset and, when growing, fills the
2138  * new space according to @prealloc.
2139  *
2140  * Returns: 0 on success, -errno on failure.
2141  */
2142 static int coroutine_fn
2143 raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2144                      PreallocMode prealloc, Error **errp)
2145 {
2146     RawPosixAIOData acb;
2147 
2148     acb = (RawPosixAIOData) {
2149         .bs             = bs,
2150         .aio_fildes     = fd,
2151         .aio_type       = QEMU_AIO_TRUNCATE,
2152         .aio_offset     = offset,
2153         .truncate       = {
2154             .prealloc       = prealloc,
2155             .errp           = errp,
2156         },
2157     };
2158 
2159     return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
2160 }
2161 
2162 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2163                                         bool exact, PreallocMode prealloc,
2164                                         BdrvRequestFlags flags, Error **errp)
2165 {
2166     BDRVRawState *s = bs->opaque;
2167     struct stat st;
2168     int ret;
2169 
2170     if (fstat(s->fd, &st)) {
2171         ret = -errno;
2172         error_setg_errno(errp, -ret, "Failed to fstat() the file");
2173         return ret;
2174     }
2175 
2176     if (S_ISREG(st.st_mode)) {
2177         /* Always resizes to the exact @offset */
2178         return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2179     }
2180 
2181     if (prealloc != PREALLOC_MODE_OFF) {
2182         error_setg(errp, "Preallocation mode '%s' unsupported for this "
2183                    "non-regular file", PreallocMode_str(prealloc));
2184         return -ENOTSUP;
2185     }
2186 
2187     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2188         int64_t cur_length = raw_getlength(bs);
2189 
2190         if (offset != cur_length && exact) {
2191             error_setg(errp, "Cannot resize device files");
2192             return -ENOTSUP;
2193         } else if (offset > cur_length) {
2194             error_setg(errp, "Cannot grow device files");
2195             return -EINVAL;
2196         }
2197     } else {
2198         error_setg(errp, "Resizing this file is not supported");
2199         return -ENOTSUP;
2200     }
2201 
2202     return 0;
2203 }
2204 
2205 #ifdef __OpenBSD__
2206 static int64_t raw_getlength(BlockDriverState *bs)
2207 {
2208     BDRVRawState *s = bs->opaque;
2209     int fd = s->fd;
2210     struct stat st;
2211 
2212     if (fstat(fd, &st))
2213         return -errno;
2214     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2215         struct disklabel dl;
2216 
2217         if (ioctl(fd, DIOCGDINFO, &dl))
2218             return -errno;
2219         return (uint64_t)dl.d_secsize *
2220             dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2221     } else
2222         return st.st_size;
2223 }
2224 #elif defined(__NetBSD__)
2225 static int64_t raw_getlength(BlockDriverState *bs)
2226 {
2227     BDRVRawState *s = bs->opaque;
2228     int fd = s->fd;
2229     struct stat st;
2230 
2231     if (fstat(fd, &st))
2232         return -errno;
2233     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2234         struct dkwedge_info dkw;
2235 
2236         if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2237             return dkw.dkw_size * 512;
2238         } else {
2239             struct disklabel dl;
2240 
2241             if (ioctl(fd, DIOCGDINFO, &dl))
2242                 return -errno;
2243             return (uint64_t)dl.d_secsize *
2244                 dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2245         }
2246     } else
2247         return st.st_size;
2248 }
2249 #elif defined(__sun__)
2250 static int64_t raw_getlength(BlockDriverState *bs)
2251 {
2252     BDRVRawState *s = bs->opaque;
2253     struct dk_minfo minfo;
2254     int ret;
2255     int64_t size;
2256 
2257     ret = fd_open(bs);
2258     if (ret < 0) {
2259         return ret;
2260     }
2261 
2262     /*
2263      * Use the DKIOCGMEDIAINFO ioctl to read the size.
2264      */
2265     ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2266     if (ret != -1) {
2267         return minfo.dki_lbsize * minfo.dki_capacity;
2268     }
2269 
2270     /*
2271      * There are reports that lseek on some devices fails, but
2272      * irc discussion said that contingency on contingency was overkill.
2273      */
2274     size = lseek(s->fd, 0, SEEK_END);
2275     if (size < 0) {
2276         return -errno;
2277     }
2278     return size;
2279 }
2280 #elif defined(CONFIG_BSD)
2281 static int64_t raw_getlength(BlockDriverState *bs)
2282 {
2283     BDRVRawState *s = bs->opaque;
2284     int fd = s->fd;
2285     int64_t size;
2286     struct stat sb;
2287 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2288     int reopened = 0;
2289 #endif
2290     int ret;
2291 
2292     ret = fd_open(bs);
2293     if (ret < 0)
2294         return ret;
2295 
2296 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2297 again:
2298 #endif
2299     if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2300 #ifdef DIOCGMEDIASIZE
2301         if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
2302 #elif defined(DIOCGPART)
2303         {
2304                 struct partinfo pi;
2305                 if (ioctl(fd, DIOCGPART, &pi) == 0)
2306                         size = pi.media_size;
2307                 else
2308                         size = 0;
2309         }
2310         if (size == 0)
2311 #endif
2312 #if defined(__APPLE__) && defined(__MACH__)
2313         {
2314             uint64_t sectors = 0;
2315             uint32_t sector_size = 0;
2316 
2317             if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2318                && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2319                 size = sectors * sector_size;
2320             } else {
2321                 size = lseek(fd, 0LL, SEEK_END);
2322                 if (size < 0) {
2323                     return -errno;
2324                 }
2325             }
2326         }
2327 #else
2328         size = lseek(fd, 0LL, SEEK_END);
2329         if (size < 0) {
2330             return -errno;
2331         }
2332 #endif
2333 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2334         switch(s->type) {
2335         case FTYPE_CD:
2336             /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2337             if (size == 2048LL * (unsigned)-1)
2338                 size = 0;
2339             /* XXX no disc?  maybe we need to reopen... */
2340             if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2341                 reopened = 1;
2342                 goto again;
2343             }
2344         }
2345 #endif
2346     } else {
2347         size = lseek(fd, 0, SEEK_END);
2348         if (size < 0) {
2349             return -errno;
2350         }
2351     }
2352     return size;
2353 }
2354 #else
2355 static int64_t raw_getlength(BlockDriverState *bs)
2356 {
2357     BDRVRawState *s = bs->opaque;
2358     int ret;
2359     int64_t size;
2360 
2361     ret = fd_open(bs);
2362     if (ret < 0) {
2363         return ret;
2364     }
2365 
2366     size = lseek(s->fd, 0, SEEK_END);
2367     if (size < 0) {
2368         return -errno;
2369     }
2370     return size;
2371 }
2372 #endif
2373 
2374 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2375 {
2376     struct stat st;
2377     BDRVRawState *s = bs->opaque;
2378 
2379     if (fstat(s->fd, &st) < 0) {
2380         return -errno;
2381     }
2382     return (int64_t)st.st_blocks * 512;
2383 }
2384 
2385 static int coroutine_fn
2386 raw_co_create(BlockdevCreateOptions *options, Error **errp)
2387 {
2388     BlockdevCreateOptionsFile *file_opts;
2389     Error *local_err = NULL;
2390     int fd;
2391     uint64_t perm, shared;
2392     int result = 0;
2393 
2394     /* Validate options and set default values */
2395     assert(options->driver == BLOCKDEV_DRIVER_FILE);
2396     file_opts = &options->u.file;
2397 
2398     if (!file_opts->has_nocow) {
2399         file_opts->nocow = false;
2400     }
2401     if (!file_opts->has_preallocation) {
2402         file_opts->preallocation = PREALLOC_MODE_OFF;
2403     }
2404     if (!file_opts->has_extent_size_hint) {
2405         file_opts->extent_size_hint = 1 * MiB;
2406     }
2407     if (file_opts->extent_size_hint > UINT32_MAX) {
2408         result = -EINVAL;
2409         error_setg(errp, "Extent size hint is too large");
2410         goto out;
2411     }
2412 
2413     /* Create file */
2414     fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
2415     if (fd < 0) {
2416         result = -errno;
2417         error_setg_errno(errp, -result, "Could not create file");
2418         goto out;
2419     }
2420 
2421     /* Take permissions: We want to discard everything, so we need
2422      * BLK_PERM_WRITE; and truncation to the desired size requires
2423      * BLK_PERM_RESIZE.
2424      * On the other hand, we cannot share the RESIZE permission
2425      * because we promise that after this function, the file has the
2426      * size given in the options.  If someone else were to resize it
2427      * concurrently, we could not guarantee that.
2428      * Note that after this function, we can no longer guarantee that
2429      * the file is not touched by a third party, so it may be resized
2430      * then. */
2431     perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2432     shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2433 
2434     /* Step one: Take locks */
2435     result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2436     if (result < 0) {
2437         goto out_close;
2438     }
2439 
2440     /* Step two: Check that nobody else has taken conflicting locks */
2441     result = raw_check_lock_bytes(fd, perm, shared, errp);
2442     if (result < 0) {
2443         error_append_hint(errp,
2444                           "Is another process using the image [%s]?\n",
2445                           file_opts->filename);
2446         goto out_unlock;
2447     }
2448 
2449     /* Clear the file by truncating it to 0 */
2450     result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2451     if (result < 0) {
2452         goto out_unlock;
2453     }
2454 
2455     if (file_opts->nocow) {
2456 #ifdef __linux__
2457         /* Set NOCOW flag to solve performance issue on fs like btrfs.
2458          * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
2459          * will be ignored since any failure of this operation should not
2460          * block the left work.
2461          */
2462         int attr;
2463         if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2464             attr |= FS_NOCOW_FL;
2465             ioctl(fd, FS_IOC_SETFLAGS, &attr);
2466         }
2467 #endif
2468     }
2469 #ifdef FS_IOC_FSSETXATTR
2470     /*
2471      * Try to set the extent size hint. Failure is not fatal, and a warning is
2472      * only printed if the option was explicitly specified.
2473      */
2474     {
2475         struct fsxattr attr;
2476         result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
2477         if (result == 0) {
2478             attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
2479             attr.fsx_extsize = file_opts->extent_size_hint;
2480             result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
2481         }
2482         if (result < 0 && file_opts->has_extent_size_hint &&
2483             file_opts->extent_size_hint)
2484         {
2485             warn_report("Failed to set extent size hint: %s",
2486                         strerror(errno));
2487         }
2488     }
2489 #endif
2490 
2491     /* Resize and potentially preallocate the file to the desired
2492      * final size */
2493     result = raw_regular_truncate(NULL, fd, file_opts->size,
2494                                   file_opts->preallocation, errp);
2495     if (result < 0) {
2496         goto out_unlock;
2497     }
2498 
2499 out_unlock:
2500     raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2501     if (local_err) {
2502         /* The above call should not fail, and if it does, that does
2503          * not mean the whole creation operation has failed.  So
2504          * report it the user for their convenience, but do not report
2505          * it to the caller. */
2506         warn_report_err(local_err);
2507     }
2508 
2509 out_close:
2510     if (qemu_close(fd) != 0 && result == 0) {
2511         result = -errno;
2512         error_setg_errno(errp, -result, "Could not close the new file");
2513     }
2514 out:
2515     return result;
2516 }
2517 
2518 static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
2519                                            const char *filename,
2520                                            QemuOpts *opts,
2521                                            Error **errp)
2522 {
2523     BlockdevCreateOptions options;
2524     int64_t total_size = 0;
2525     int64_t extent_size_hint = 0;
2526     bool has_extent_size_hint = false;
2527     bool nocow = false;
2528     PreallocMode prealloc;
2529     char *buf = NULL;
2530     Error *local_err = NULL;
2531 
2532     /* Skip file: protocol prefix */
2533     strstart(filename, "file:", &filename);
2534 
2535     /* Read out options */
2536     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2537                           BDRV_SECTOR_SIZE);
2538     if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
2539         has_extent_size_hint = true;
2540         extent_size_hint =
2541             qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
2542     }
2543     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2544     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2545     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2546                                PREALLOC_MODE_OFF, &local_err);
2547     g_free(buf);
2548     if (local_err) {
2549         error_propagate(errp, local_err);
2550         return -EINVAL;
2551     }
2552 
2553     options = (BlockdevCreateOptions) {
2554         .driver     = BLOCKDEV_DRIVER_FILE,
2555         .u.file     = {
2556             .filename           = (char *) filename,
2557             .size               = total_size,
2558             .has_preallocation  = true,
2559             .preallocation      = prealloc,
2560             .has_nocow          = true,
2561             .nocow              = nocow,
2562             .has_extent_size_hint = has_extent_size_hint,
2563             .extent_size_hint   = extent_size_hint,
2564         },
2565     };
2566     return raw_co_create(&options, errp);
2567 }
2568 
2569 static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
2570                                            Error **errp)
2571 {
2572     struct stat st;
2573     int ret;
2574 
2575     if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) {
2576         error_setg_errno(errp, ENOENT, "%s is not a regular file",
2577                          bs->filename);
2578         return -ENOENT;
2579     }
2580 
2581     ret = unlink(bs->filename);
2582     if (ret < 0) {
2583         ret = -errno;
2584         error_setg_errno(errp, -ret, "Error when deleting file %s",
2585                          bs->filename);
2586     }
2587 
2588     return ret;
2589 }
2590 
2591 /*
2592  * Find allocation range in @bs around offset @start.
2593  * May change underlying file descriptor's file offset.
2594  * If @start is not in a hole, store @start in @data, and the
2595  * beginning of the next hole in @hole, and return 0.
2596  * If @start is in a non-trailing hole, store @start in @hole and the
2597  * beginning of the next non-hole in @data, and return 0.
2598  * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2599  * If we can't find out, return a negative errno other than -ENXIO.
2600  */
2601 static int find_allocation(BlockDriverState *bs, off_t start,
2602                            off_t *data, off_t *hole)
2603 {
2604 #if defined SEEK_HOLE && defined SEEK_DATA
2605     BDRVRawState *s = bs->opaque;
2606     off_t offs;
2607 
2608     /*
2609      * SEEK_DATA cases:
2610      * D1. offs == start: start is in data
2611      * D2. offs > start: start is in a hole, next data at offs
2612      * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2613      *                              or start is beyond EOF
2614      *     If the latter happens, the file has been truncated behind
2615      *     our back since we opened it.  All bets are off then.
2616      *     Treating like a trailing hole is simplest.
2617      * D4. offs < 0, errno != ENXIO: we learned nothing
2618      */
2619     offs = lseek(s->fd, start, SEEK_DATA);
2620     if (offs < 0) {
2621         return -errno;          /* D3 or D4 */
2622     }
2623 
2624     if (offs < start) {
2625         /* This is not a valid return by lseek().  We are safe to just return
2626          * -EIO in this case, and we'll treat it like D4. */
2627         return -EIO;
2628     }
2629 
2630     if (offs > start) {
2631         /* D2: in hole, next data at offs */
2632         *hole = start;
2633         *data = offs;
2634         return 0;
2635     }
2636 
2637     /* D1: in data, end not yet known */
2638 
2639     /*
2640      * SEEK_HOLE cases:
2641      * H1. offs == start: start is in a hole
2642      *     If this happens here, a hole has been dug behind our back
2643      *     since the previous lseek().
2644      * H2. offs > start: either start is in data, next hole at offs,
2645      *                   or start is in trailing hole, EOF at offs
2646      *     Linux treats trailing holes like any other hole: offs ==
2647      *     start.  Solaris seeks to EOF instead: offs > start (blech).
2648      *     If that happens here, a hole has been dug behind our back
2649      *     since the previous lseek().
2650      * H3. offs < 0, errno = ENXIO: start is beyond EOF
2651      *     If this happens, the file has been truncated behind our
2652      *     back since we opened it.  Treat it like a trailing hole.
2653      * H4. offs < 0, errno != ENXIO: we learned nothing
2654      *     Pretend we know nothing at all, i.e. "forget" about D1.
2655      */
2656     offs = lseek(s->fd, start, SEEK_HOLE);
2657     if (offs < 0) {
2658         return -errno;          /* D1 and (H3 or H4) */
2659     }
2660 
2661     if (offs < start) {
2662         /* This is not a valid return by lseek().  We are safe to just return
2663          * -EIO in this case, and we'll treat it like H4. */
2664         return -EIO;
2665     }
2666 
2667     if (offs > start) {
2668         /*
2669          * D1 and H2: either in data, next hole at offs, or it was in
2670          * data but is now in a trailing hole.  In the latter case,
2671          * all bets are off.  Treating it as if it there was data all
2672          * the way to EOF is safe, so simply do that.
2673          */
2674         *data = start;
2675         *hole = offs;
2676         return 0;
2677     }
2678 
2679     /* D1 and H1 */
2680     return -EBUSY;
2681 #else
2682     return -ENOTSUP;
2683 #endif
2684 }
2685 
2686 /*
2687  * Returns the allocation status of the specified offset.
2688  *
2689  * The block layer guarantees 'offset' and 'bytes' are within bounds.
2690  *
2691  * 'pnum' is set to the number of bytes (including and immediately following
2692  * the specified offset) that are known to be in the same
2693  * allocated/unallocated state.
2694  *
2695  * 'bytes' is the max value 'pnum' should be set to.
2696  */
2697 static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2698                                             bool want_zero,
2699                                             int64_t offset,
2700                                             int64_t bytes, int64_t *pnum,
2701                                             int64_t *map,
2702                                             BlockDriverState **file)
2703 {
2704     off_t data = 0, hole = 0;
2705     int ret;
2706 
2707     assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
2708 
2709     ret = fd_open(bs);
2710     if (ret < 0) {
2711         return ret;
2712     }
2713 
2714     if (!want_zero) {
2715         *pnum = bytes;
2716         *map = offset;
2717         *file = bs;
2718         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2719     }
2720 
2721     ret = find_allocation(bs, offset, &data, &hole);
2722     if (ret == -ENXIO) {
2723         /* Trailing hole */
2724         *pnum = bytes;
2725         ret = BDRV_BLOCK_ZERO;
2726     } else if (ret < 0) {
2727         /* No info available, so pretend there are no holes */
2728         *pnum = bytes;
2729         ret = BDRV_BLOCK_DATA;
2730     } else if (data == offset) {
2731         /* On a data extent, compute bytes to the end of the extent,
2732          * possibly including a partial sector at EOF. */
2733         *pnum = MIN(bytes, hole - offset);
2734 
2735         /*
2736          * We are not allowed to return partial sectors, though, so
2737          * round up if necessary.
2738          */
2739         if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
2740             int64_t file_length = raw_getlength(bs);
2741             if (file_length > 0) {
2742                 /* Ignore errors, this is just a safeguard */
2743                 assert(hole == file_length);
2744             }
2745             *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
2746         }
2747 
2748         ret = BDRV_BLOCK_DATA;
2749     } else {
2750         /* On a hole, compute bytes to the beginning of the next extent.  */
2751         assert(hole == offset);
2752         *pnum = MIN(bytes, data - offset);
2753         ret = BDRV_BLOCK_ZERO;
2754     }
2755     *map = offset;
2756     *file = bs;
2757     return ret | BDRV_BLOCK_OFFSET_VALID;
2758 }
2759 
2760 #if defined(__linux__)
2761 /* Verify that the file is not in the page cache */
2762 static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2763 {
2764     const size_t window_size = 128 * 1024 * 1024;
2765     BDRVRawState *s = bs->opaque;
2766     void *window = NULL;
2767     size_t length = 0;
2768     unsigned char *vec;
2769     size_t page_size;
2770     off_t offset;
2771     off_t end;
2772 
2773     /* mincore(2) page status information requires 1 byte per page */
2774     page_size = sysconf(_SC_PAGESIZE);
2775     vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2776 
2777     end = raw_getlength(bs);
2778 
2779     for (offset = 0; offset < end; offset += window_size) {
2780         void *new_window;
2781         size_t new_length;
2782         size_t vec_end;
2783         size_t i;
2784         int ret;
2785 
2786         /* Unmap previous window if size has changed */
2787         new_length = MIN(end - offset, window_size);
2788         if (new_length != length) {
2789             munmap(window, length);
2790             window = NULL;
2791             length = 0;
2792         }
2793 
2794         new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2795                           s->fd, offset);
2796         if (new_window == MAP_FAILED) {
2797             error_setg_errno(errp, errno, "mmap failed");
2798             break;
2799         }
2800 
2801         window = new_window;
2802         length = new_length;
2803 
2804         ret = mincore(window, length, vec);
2805         if (ret < 0) {
2806             error_setg_errno(errp, errno, "mincore failed");
2807             break;
2808         }
2809 
2810         vec_end = DIV_ROUND_UP(length, page_size);
2811         for (i = 0; i < vec_end; i++) {
2812             if (vec[i] & 0x1) {
2813                 break;
2814             }
2815         }
2816         if (i < vec_end) {
2817             error_setg(errp, "page cache still in use!");
2818             break;
2819         }
2820     }
2821 
2822     if (window) {
2823         munmap(window, length);
2824     }
2825 
2826     g_free(vec);
2827 }
2828 #endif /* __linux__ */
2829 
2830 static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2831                                                  Error **errp)
2832 {
2833     BDRVRawState *s = bs->opaque;
2834     int ret;
2835 
2836     ret = fd_open(bs);
2837     if (ret < 0) {
2838         error_setg_errno(errp, -ret, "The file descriptor is not open");
2839         return;
2840     }
2841 
2842     if (!s->drop_cache) {
2843         return;
2844     }
2845 
2846     if (s->open_flags & O_DIRECT) {
2847         return; /* No host kernel page cache */
2848     }
2849 
2850 #if defined(__linux__)
2851     /* This sets the scene for the next syscall... */
2852     ret = bdrv_co_flush(bs);
2853     if (ret < 0) {
2854         error_setg_errno(errp, -ret, "flush failed");
2855         return;
2856     }
2857 
2858     /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2859      * process.  These limitations are okay because we just fsynced the file,
2860      * we don't use mmap, and the file should not be in use by other processes.
2861      */
2862     ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2863     if (ret != 0) { /* the return value is a positive errno */
2864         error_setg_errno(errp, ret, "fadvise failed");
2865         return;
2866     }
2867 
2868     if (s->check_cache_dropped) {
2869         check_cache_dropped(bs, errp);
2870     }
2871 #else /* __linux__ */
2872     /* Do nothing.  Live migration to a remote host with cache.direct=off is
2873      * unsupported on other host operating systems.  Cache consistency issues
2874      * may occur but no error is reported here, partly because that's the
2875      * historical behavior and partly because it's hard to differentiate valid
2876      * configurations that should not cause errors.
2877      */
2878 #endif /* !__linux__ */
2879 }
2880 
2881 static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
2882 {
2883     if (ret) {
2884         s->stats.discard_nb_failed++;
2885     } else {
2886         s->stats.discard_nb_ok++;
2887         s->stats.discard_bytes_ok += nbytes;
2888     }
2889 }
2890 
2891 static coroutine_fn int
2892 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev)
2893 {
2894     BDRVRawState *s = bs->opaque;
2895     RawPosixAIOData acb;
2896     int ret;
2897 
2898     acb = (RawPosixAIOData) {
2899         .bs             = bs,
2900         .aio_fildes     = s->fd,
2901         .aio_type       = QEMU_AIO_DISCARD,
2902         .aio_offset     = offset,
2903         .aio_nbytes     = bytes,
2904     };
2905 
2906     if (blkdev) {
2907         acb.aio_type |= QEMU_AIO_BLKDEV;
2908     }
2909 
2910     ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
2911     raw_account_discard(s, bytes, ret);
2912     return ret;
2913 }
2914 
2915 static coroutine_fn int
2916 raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2917 {
2918     return raw_do_pdiscard(bs, offset, bytes, false);
2919 }
2920 
2921 static int coroutine_fn
2922 raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
2923                      BdrvRequestFlags flags, bool blkdev)
2924 {
2925     BDRVRawState *s = bs->opaque;
2926     RawPosixAIOData acb;
2927     ThreadPoolFunc *handler;
2928 
2929 #ifdef CONFIG_FALLOCATE
2930     if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
2931         BdrvTrackedRequest *req;
2932         uint64_t end;
2933 
2934         /*
2935          * This is a workaround for a bug in the Linux XFS driver,
2936          * where writes submitted through the AIO interface will be
2937          * discarded if they happen beyond a concurrently running
2938          * fallocate() that increases the file length (i.e., both the
2939          * write and the fallocate() happen beyond the EOF).
2940          *
2941          * To work around it, we extend the tracked request for this
2942          * zero write until INT64_MAX (effectively infinity), and mark
2943          * it as serializing.
2944          *
2945          * We have to enable this workaround for all filesystems and
2946          * AIO modes (not just XFS with aio=native), because for
2947          * remote filesystems we do not know the host configuration.
2948          */
2949 
2950         req = bdrv_co_get_self_request(bs);
2951         assert(req);
2952         assert(req->type == BDRV_TRACKED_WRITE);
2953         assert(req->offset <= offset);
2954         assert(req->offset + req->bytes >= offset + bytes);
2955 
2956         end = INT64_MAX & -(uint64_t)bs->bl.request_alignment;
2957         req->bytes = end - req->offset;
2958         req->overlap_bytes = req->bytes;
2959 
2960         bdrv_mark_request_serialising(req, bs->bl.request_alignment);
2961     }
2962 #endif
2963 
2964     acb = (RawPosixAIOData) {
2965         .bs             = bs,
2966         .aio_fildes     = s->fd,
2967         .aio_type       = QEMU_AIO_WRITE_ZEROES,
2968         .aio_offset     = offset,
2969         .aio_nbytes     = bytes,
2970     };
2971 
2972     if (blkdev) {
2973         acb.aio_type |= QEMU_AIO_BLKDEV;
2974     }
2975     if (flags & BDRV_REQ_NO_FALLBACK) {
2976         acb.aio_type |= QEMU_AIO_NO_FALLBACK;
2977     }
2978 
2979     if (flags & BDRV_REQ_MAY_UNMAP) {
2980         acb.aio_type |= QEMU_AIO_DISCARD;
2981         handler = handle_aiocb_write_zeroes_unmap;
2982     } else {
2983         handler = handle_aiocb_write_zeroes;
2984     }
2985 
2986     return raw_thread_pool_submit(bs, handler, &acb);
2987 }
2988 
2989 static int coroutine_fn raw_co_pwrite_zeroes(
2990     BlockDriverState *bs, int64_t offset,
2991     int bytes, BdrvRequestFlags flags)
2992 {
2993     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
2994 }
2995 
2996 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2997 {
2998     return 0;
2999 }
3000 
3001 static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
3002 {
3003     BDRVRawState *s = bs->opaque;
3004     return (BlockStatsSpecificFile) {
3005         .discard_nb_ok = s->stats.discard_nb_ok,
3006         .discard_nb_failed = s->stats.discard_nb_failed,
3007         .discard_bytes_ok = s->stats.discard_bytes_ok,
3008     };
3009 }
3010 
3011 static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
3012 {
3013     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3014 
3015     stats->driver = BLOCKDEV_DRIVER_FILE;
3016     stats->u.file = get_blockstats_specific_file(bs);
3017 
3018     return stats;
3019 }
3020 
3021 static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
3022 {
3023     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3024 
3025     stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
3026     stats->u.host_device = get_blockstats_specific_file(bs);
3027 
3028     return stats;
3029 }
3030 
3031 static QemuOptsList raw_create_opts = {
3032     .name = "raw-create-opts",
3033     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
3034     .desc = {
3035         {
3036             .name = BLOCK_OPT_SIZE,
3037             .type = QEMU_OPT_SIZE,
3038             .help = "Virtual disk size"
3039         },
3040         {
3041             .name = BLOCK_OPT_NOCOW,
3042             .type = QEMU_OPT_BOOL,
3043             .help = "Turn off copy-on-write (valid only on btrfs)"
3044         },
3045         {
3046             .name = BLOCK_OPT_PREALLOC,
3047             .type = QEMU_OPT_STRING,
3048             .help = "Preallocation mode (allowed values: off"
3049 #ifdef CONFIG_POSIX_FALLOCATE
3050                     ", falloc"
3051 #endif
3052                     ", full)"
3053         },
3054         {
3055             .name = BLOCK_OPT_EXTENT_SIZE_HINT,
3056             .type = QEMU_OPT_SIZE,
3057             .help = "Extent size hint for the image file, 0 to disable"
3058         },
3059         { /* end of list */ }
3060     }
3061 };
3062 
3063 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
3064                           Error **errp)
3065 {
3066     BDRVRawState *s = bs->opaque;
3067     BDRVRawReopenState *rs = NULL;
3068     int open_flags;
3069     int ret;
3070 
3071     if (s->perm_change_fd) {
3072         /*
3073          * In the context of reopen, this function may be called several times
3074          * (directly and recursively while change permissions of the parent).
3075          * This is even true for children that don't inherit from the original
3076          * reopen node, so s->reopen_state is not set.
3077          *
3078          * Ignore all but the first call.
3079          */
3080         return 0;
3081     }
3082 
3083     if (s->reopen_state) {
3084         /* We already have a new file descriptor to set permissions for */
3085         assert(s->reopen_state->perm == perm);
3086         assert(s->reopen_state->shared_perm == shared);
3087         rs = s->reopen_state->opaque;
3088         s->perm_change_fd = rs->fd;
3089         s->perm_change_flags = rs->open_flags;
3090     } else {
3091         /* We may need a new fd if auto-read-only switches the mode */
3092         ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
3093                                     false, errp);
3094         if (ret < 0) {
3095             return ret;
3096         } else if (ret != s->fd) {
3097             s->perm_change_fd = ret;
3098             s->perm_change_flags = open_flags;
3099         }
3100     }
3101 
3102     /* Prepare permissions on old fd to avoid conflicts between old and new,
3103      * but keep everything locked that new will need. */
3104     ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
3105     if (ret < 0) {
3106         goto fail;
3107     }
3108 
3109     /* Copy locks to the new fd */
3110     if (s->perm_change_fd) {
3111         ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
3112                                    false, errp);
3113         if (ret < 0) {
3114             raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3115             goto fail;
3116         }
3117     }
3118     return 0;
3119 
3120 fail:
3121     if (s->perm_change_fd && !s->reopen_state) {
3122         qemu_close(s->perm_change_fd);
3123     }
3124     s->perm_change_fd = 0;
3125     return ret;
3126 }
3127 
3128 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
3129 {
3130     BDRVRawState *s = bs->opaque;
3131 
3132     /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
3133      * called after .bdrv_reopen_commit) */
3134     if (s->perm_change_fd && s->fd != s->perm_change_fd) {
3135         qemu_close(s->fd);
3136         s->fd = s->perm_change_fd;
3137         s->open_flags = s->perm_change_flags;
3138     }
3139     s->perm_change_fd = 0;
3140 
3141     raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
3142     s->perm = perm;
3143     s->shared_perm = shared;
3144 }
3145 
3146 static void raw_abort_perm_update(BlockDriverState *bs)
3147 {
3148     BDRVRawState *s = bs->opaque;
3149 
3150     /* For reopen, .bdrv_reopen_abort is called afterwards and will close
3151      * the file descriptor. */
3152     if (s->perm_change_fd && !s->reopen_state) {
3153         qemu_close(s->perm_change_fd);
3154     }
3155     s->perm_change_fd = 0;
3156 
3157     raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3158 }
3159 
3160 static int coroutine_fn raw_co_copy_range_from(
3161         BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
3162         BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
3163         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
3164 {
3165     return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3166                                  read_flags, write_flags);
3167 }
3168 
3169 static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
3170                                              BdrvChild *src,
3171                                              uint64_t src_offset,
3172                                              BdrvChild *dst,
3173                                              uint64_t dst_offset,
3174                                              uint64_t bytes,
3175                                              BdrvRequestFlags read_flags,
3176                                              BdrvRequestFlags write_flags)
3177 {
3178     RawPosixAIOData acb;
3179     BDRVRawState *s = bs->opaque;
3180     BDRVRawState *src_s;
3181 
3182     assert(dst->bs == bs);
3183     if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
3184         return -ENOTSUP;
3185     }
3186 
3187     src_s = src->bs->opaque;
3188     if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
3189         return -EIO;
3190     }
3191 
3192     acb = (RawPosixAIOData) {
3193         .bs             = bs,
3194         .aio_type       = QEMU_AIO_COPY_RANGE,
3195         .aio_fildes     = src_s->fd,
3196         .aio_offset     = src_offset,
3197         .aio_nbytes     = bytes,
3198         .copy_range     = {
3199             .aio_fd2        = s->fd,
3200             .aio_offset2    = dst_offset,
3201         },
3202     };
3203 
3204     return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
3205 }
3206 
3207 BlockDriver bdrv_file = {
3208     .format_name = "file",
3209     .protocol_name = "file",
3210     .instance_size = sizeof(BDRVRawState),
3211     .bdrv_needs_filename = true,
3212     .bdrv_probe = NULL, /* no probe for protocols */
3213     .bdrv_parse_filename = raw_parse_filename,
3214     .bdrv_file_open = raw_open,
3215     .bdrv_reopen_prepare = raw_reopen_prepare,
3216     .bdrv_reopen_commit = raw_reopen_commit,
3217     .bdrv_reopen_abort = raw_reopen_abort,
3218     .bdrv_close = raw_close,
3219     .bdrv_co_create = raw_co_create,
3220     .bdrv_co_create_opts = raw_co_create_opts,
3221     .bdrv_has_zero_init = bdrv_has_zero_init_1,
3222     .bdrv_co_block_status = raw_co_block_status,
3223     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3224     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
3225     .bdrv_co_delete_file = raw_co_delete_file,
3226 
3227     .bdrv_co_preadv         = raw_co_preadv,
3228     .bdrv_co_pwritev        = raw_co_pwritev,
3229     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3230     .bdrv_co_pdiscard       = raw_co_pdiscard,
3231     .bdrv_co_copy_range_from = raw_co_copy_range_from,
3232     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3233     .bdrv_refresh_limits = raw_refresh_limits,
3234     .bdrv_io_plug = raw_aio_plug,
3235     .bdrv_io_unplug = raw_aio_unplug,
3236     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3237 
3238     .bdrv_co_truncate = raw_co_truncate,
3239     .bdrv_getlength = raw_getlength,
3240     .bdrv_get_info = raw_get_info,
3241     .bdrv_get_allocated_file_size
3242                         = raw_get_allocated_file_size,
3243     .bdrv_get_specific_stats = raw_get_specific_stats,
3244     .bdrv_check_perm = raw_check_perm,
3245     .bdrv_set_perm   = raw_set_perm,
3246     .bdrv_abort_perm_update = raw_abort_perm_update,
3247     .create_opts = &raw_create_opts,
3248     .mutable_opts = mutable_opts,
3249 };
3250 
3251 /***********************************************/
3252 /* host device */
3253 
3254 #if defined(__APPLE__) && defined(__MACH__)
3255 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3256                                 CFIndex maxPathSize, int flags);
3257 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
3258 {
3259     kern_return_t kernResult = KERN_FAILURE;
3260     mach_port_t     masterPort;
3261     CFMutableDictionaryRef  classesToMatch;
3262     const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
3263     char *mediaType = NULL;
3264 
3265     kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
3266     if ( KERN_SUCCESS != kernResult ) {
3267         printf( "IOMasterPort returned %d\n", kernResult );
3268     }
3269 
3270     int index;
3271     for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
3272         classesToMatch = IOServiceMatching(matching_array[index]);
3273         if (classesToMatch == NULL) {
3274             error_report("IOServiceMatching returned NULL for %s",
3275                          matching_array[index]);
3276             continue;
3277         }
3278         CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
3279                              kCFBooleanTrue);
3280         kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
3281                                                   mediaIterator);
3282         if (kernResult != KERN_SUCCESS) {
3283             error_report("Note: IOServiceGetMatchingServices returned %d",
3284                          kernResult);
3285             continue;
3286         }
3287 
3288         /* If a match was found, leave the loop */
3289         if (*mediaIterator != 0) {
3290             trace_file_FindEjectableOpticalMedia(matching_array[index]);
3291             mediaType = g_strdup(matching_array[index]);
3292             break;
3293         }
3294     }
3295     return mediaType;
3296 }
3297 
3298 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3299                          CFIndex maxPathSize, int flags)
3300 {
3301     io_object_t     nextMedia;
3302     kern_return_t   kernResult = KERN_FAILURE;
3303     *bsdPath = '\0';
3304     nextMedia = IOIteratorNext( mediaIterator );
3305     if ( nextMedia )
3306     {
3307         CFTypeRef   bsdPathAsCFString;
3308     bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
3309         if ( bsdPathAsCFString ) {
3310             size_t devPathLength;
3311             strcpy( bsdPath, _PATH_DEV );
3312             if (flags & BDRV_O_NOCACHE) {
3313                 strcat(bsdPath, "r");
3314             }
3315             devPathLength = strlen( bsdPath );
3316             if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
3317                 kernResult = KERN_SUCCESS;
3318             }
3319             CFRelease( bsdPathAsCFString );
3320         }
3321         IOObjectRelease( nextMedia );
3322     }
3323 
3324     return kernResult;
3325 }
3326 
3327 /* Sets up a real cdrom for use in QEMU */
3328 static bool setup_cdrom(char *bsd_path, Error **errp)
3329 {
3330     int index, num_of_test_partitions = 2, fd;
3331     char test_partition[MAXPATHLEN];
3332     bool partition_found = false;
3333 
3334     /* look for a working partition */
3335     for (index = 0; index < num_of_test_partitions; index++) {
3336         snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
3337                  index);
3338         fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
3339         if (fd >= 0) {
3340             partition_found = true;
3341             qemu_close(fd);
3342             break;
3343         }
3344     }
3345 
3346     /* if a working partition on the device was not found */
3347     if (partition_found == false) {
3348         error_setg(errp, "Failed to find a working partition on disc");
3349     } else {
3350         trace_file_setup_cdrom(test_partition);
3351         pstrcpy(bsd_path, MAXPATHLEN, test_partition);
3352     }
3353     return partition_found;
3354 }
3355 
/*
 * Prints directions on mounting and unmounting a device: a hint that the
 * device may need to be unmounted, followed by the diskutil commands to
 * unmount and mount @file_name.
 */
static void print_unmounting_directions(const char *file_name)
{
    const char *dev = file_name;

    error_report("If device %s is mounted on the desktop, unmount"
                 " it first before using it in QEMU", dev);
    error_report("Command to unmount device: diskutil unmountDisk %s", dev);
    error_report("Command to mount device: diskutil mountDisk %s", dev);
}
3365 
3366 #endif /* defined(__APPLE__) && defined(__MACH__) */
3367 
/*
 * Probe score of the host_device driver for @filename.
 *
 * Returns 50 for names starting with "/dev/cdrom", 100 for existing
 * character or block devices, and 0 for everything else.
 */
static int hdev_probe_device(const char *filename)
{
    struct stat st;
    bool is_device;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    is_device = stat(filename, &st) >= 0 &&
                (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode));

    return is_device ? 100 : 0;
}
3383 
3384 static void hdev_parse_filename(const char *filename, QDict *options,
3385                                 Error **errp)
3386 {
3387     bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
3388 }
3389 
3390 static bool hdev_is_sg(BlockDriverState *bs)
3391 {
3392 
3393 #if defined(__linux__)
3394 
3395     BDRVRawState *s = bs->opaque;
3396     struct stat st;
3397     struct sg_scsi_id scsiid;
3398     int sg_version;
3399     int ret;
3400 
3401     if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
3402         return false;
3403     }
3404 
3405     ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
3406     if (ret < 0) {
3407         return false;
3408     }
3409 
3410     ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
3411     if (ret >= 0) {
3412         trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
3413         return true;
3414     }
3415 
3416 #endif
3417 
3418     return false;
3419 }
3420 
3421 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
3422                      Error **errp)
3423 {
3424     BDRVRawState *s = bs->opaque;
3425     int ret;
3426 
3427 #if defined(__APPLE__) && defined(__MACH__)
3428     /*
3429      * Caution: while qdict_get_str() is fine, getting non-string types
3430      * would require more care.  When @options come from -blockdev or
3431      * blockdev_add, its members are typed according to the QAPI
3432      * schema, but when they come from -drive, they're all QString.
3433      */
3434     const char *filename = qdict_get_str(options, "filename");
3435     char bsd_path[MAXPATHLEN] = "";
3436     bool error_occurred = false;
3437 
3438     /* If using a real cdrom */
3439     if (strcmp(filename, "/dev/cdrom") == 0) {
3440         char *mediaType = NULL;
3441         kern_return_t ret_val;
3442         io_iterator_t mediaIterator = 0;
3443 
3444         mediaType = FindEjectableOpticalMedia(&mediaIterator);
3445         if (mediaType == NULL) {
3446             error_setg(errp, "Please make sure your CD/DVD is in the optical"
3447                        " drive");
3448             error_occurred = true;
3449             goto hdev_open_Mac_error;
3450         }
3451 
3452         ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
3453         if (ret_val != KERN_SUCCESS) {
3454             error_setg(errp, "Could not get BSD path for optical drive");
3455             error_occurred = true;
3456             goto hdev_open_Mac_error;
3457         }
3458 
3459         /* If a real optical drive was not found */
3460         if (bsd_path[0] == '\0') {
3461             error_setg(errp, "Failed to obtain bsd path for optical drive");
3462             error_occurred = true;
3463             goto hdev_open_Mac_error;
3464         }
3465 
3466         /* If using a cdrom disc and finding a partition on the disc failed */
3467         if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
3468             setup_cdrom(bsd_path, errp) == false) {
3469             print_unmounting_directions(bsd_path);
3470             error_occurred = true;
3471             goto hdev_open_Mac_error;
3472         }
3473 
3474         qdict_put_str(options, "filename", bsd_path);
3475 
3476 hdev_open_Mac_error:
3477         g_free(mediaType);
3478         if (mediaIterator) {
3479             IOObjectRelease(mediaIterator);
3480         }
3481         if (error_occurred) {
3482             return -ENOENT;
3483         }
3484     }
3485 #endif /* defined(__APPLE__) && defined(__MACH__) */
3486 
3487     s->type = FTYPE_FILE;
3488 
3489     ret = raw_open_common(bs, options, flags, 0, true, errp);
3490     if (ret < 0) {
3491 #if defined(__APPLE__) && defined(__MACH__)
3492         if (*bsd_path) {
3493             filename = bsd_path;
3494         }
3495         /* if a physical device experienced an error while being opened */
3496         if (strncmp(filename, "/dev/", 5) == 0) {
3497             print_unmounting_directions(filename);
3498         }
3499 #endif /* defined(__APPLE__) && defined(__MACH__) */
3500         return ret;
3501     }
3502 
3503     /* Since this does ioctl the device must be already opened */
3504     bs->sg = hdev_is_sg(bs);
3505 
3506     return ret;
3507 }
3508 
3509 #if defined(__linux__)
3510 static int coroutine_fn
3511 hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3512 {
3513     BDRVRawState *s = bs->opaque;
3514     RawPosixAIOData acb;
3515     int ret;
3516 
3517     ret = fd_open(bs);
3518     if (ret < 0) {
3519         return ret;
3520     }
3521 
3522     if (req == SG_IO && s->pr_mgr) {
3523         struct sg_io_hdr *io_hdr = buf;
3524         if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3525             io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3526             return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3527                                       s->fd, io_hdr);
3528         }
3529     }
3530 
3531     acb = (RawPosixAIOData) {
3532         .bs         = bs,
3533         .aio_type   = QEMU_AIO_IOCTL,
3534         .aio_fildes = s->fd,
3535         .aio_offset = 0,
3536         .ioctl      = {
3537             .buf        = buf,
3538             .cmd        = req,
3539         },
3540     };
3541 
3542     return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
3543 }
3544 #endif /* linux */
3545 
3546 static int fd_open(BlockDriverState *bs)
3547 {
3548     BDRVRawState *s = bs->opaque;
3549 
3550     /* this is just to ensure s->fd is sane (its called by io ops) */
3551     if (s->fd >= 0)
3552         return 0;
3553     return -EIO;
3554 }
3555 
3556 static coroutine_fn int
3557 hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
3558 {
3559     BDRVRawState *s = bs->opaque;
3560     int ret;
3561 
3562     ret = fd_open(bs);
3563     if (ret < 0) {
3564         raw_account_discard(s, bytes, ret);
3565         return ret;
3566     }
3567     return raw_do_pdiscard(bs, offset, bytes, true);
3568 }
3569 
3570 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3571     int64_t offset, int bytes, BdrvRequestFlags flags)
3572 {
3573     int rc;
3574 
3575     rc = fd_open(bs);
3576     if (rc < 0) {
3577         return rc;
3578     }
3579 
3580     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
3581 }
3582 
3583 static BlockDriver bdrv_host_device = {
3584     .format_name        = "host_device",
3585     .protocol_name        = "host_device",
3586     .instance_size      = sizeof(BDRVRawState),
3587     .bdrv_needs_filename = true,
3588     .bdrv_probe_device  = hdev_probe_device,
3589     .bdrv_parse_filename = hdev_parse_filename,
3590     .bdrv_file_open     = hdev_open,
3591     .bdrv_close         = raw_close,
3592     .bdrv_reopen_prepare = raw_reopen_prepare,
3593     .bdrv_reopen_commit  = raw_reopen_commit,
3594     .bdrv_reopen_abort   = raw_reopen_abort,
3595     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3596     .create_opts         = &bdrv_create_opts_simple,
3597     .mutable_opts        = mutable_opts,
3598     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3599     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3600 
3601     .bdrv_co_preadv         = raw_co_preadv,
3602     .bdrv_co_pwritev        = raw_co_pwritev,
3603     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3604     .bdrv_co_pdiscard       = hdev_co_pdiscard,
3605     .bdrv_co_copy_range_from = raw_co_copy_range_from,
3606     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3607     .bdrv_refresh_limits = raw_refresh_limits,
3608     .bdrv_io_plug = raw_aio_plug,
3609     .bdrv_io_unplug = raw_aio_unplug,
3610     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3611 
3612     .bdrv_co_truncate       = raw_co_truncate,
3613     .bdrv_getlength	= raw_getlength,
3614     .bdrv_get_info = raw_get_info,
3615     .bdrv_get_allocated_file_size
3616                         = raw_get_allocated_file_size,
3617     .bdrv_get_specific_stats = hdev_get_specific_stats,
3618     .bdrv_check_perm = raw_check_perm,
3619     .bdrv_set_perm   = raw_set_perm,
3620     .bdrv_abort_perm_update = raw_abort_perm_update,
3621     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3622     .bdrv_probe_geometry = hdev_probe_geometry,
3623 
3624     /* generic scsi device */
3625 #ifdef __linux__
3626     .bdrv_co_ioctl          = hdev_co_ioctl,
3627 #endif
3628 };
3629 
3630 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3631 static void cdrom_parse_filename(const char *filename, QDict *options,
3632                                  Error **errp)
3633 {
3634     bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3635 }
3636 #endif
3637 
3638 #ifdef __linux__
3639 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3640                       Error **errp)
3641 {
3642     BDRVRawState *s = bs->opaque;
3643 
3644     s->type = FTYPE_CD;
3645 
3646     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3647     return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3648 }
3649 
3650 static int cdrom_probe_device(const char *filename)
3651 {
3652     int fd, ret;
3653     int prio = 0;
3654     struct stat st;
3655 
3656     fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
3657     if (fd < 0) {
3658         goto out;
3659     }
3660     ret = fstat(fd, &st);
3661     if (ret == -1 || !S_ISBLK(st.st_mode)) {
3662         goto outc;
3663     }
3664 
3665     /* Attempt to detect via a CDROM specific ioctl */
3666     ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3667     if (ret >= 0)
3668         prio = 100;
3669 
3670 outc:
3671     qemu_close(fd);
3672 out:
3673     return prio;
3674 }
3675 
3676 static bool cdrom_is_inserted(BlockDriverState *bs)
3677 {
3678     BDRVRawState *s = bs->opaque;
3679     int ret;
3680 
3681     ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3682     return ret == CDS_DISC_OK;
3683 }
3684 
3685 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3686 {
3687     BDRVRawState *s = bs->opaque;
3688 
3689     if (eject_flag) {
3690         if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3691             perror("CDROMEJECT");
3692     } else {
3693         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3694             perror("CDROMEJECT");
3695     }
3696 }
3697 
3698 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3699 {
3700     BDRVRawState *s = bs->opaque;
3701 
3702     if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3703         /*
3704          * Note: an error can happen if the distribution automatically
3705          * mounts the CD-ROM
3706          */
3707         /* perror("CDROM_LOCKDOOR"); */
3708     }
3709 }
3710 
3711 static BlockDriver bdrv_host_cdrom = {
3712     .format_name        = "host_cdrom",
3713     .protocol_name      = "host_cdrom",
3714     .instance_size      = sizeof(BDRVRawState),
3715     .bdrv_needs_filename = true,
3716     .bdrv_probe_device	= cdrom_probe_device,
3717     .bdrv_parse_filename = cdrom_parse_filename,
3718     .bdrv_file_open     = cdrom_open,
3719     .bdrv_close         = raw_close,
3720     .bdrv_reopen_prepare = raw_reopen_prepare,
3721     .bdrv_reopen_commit  = raw_reopen_commit,
3722     .bdrv_reopen_abort   = raw_reopen_abort,
3723     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3724     .create_opts         = &bdrv_create_opts_simple,
3725     .mutable_opts        = mutable_opts,
3726     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3727 
3728     .bdrv_co_preadv         = raw_co_preadv,
3729     .bdrv_co_pwritev        = raw_co_pwritev,
3730     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3731     .bdrv_refresh_limits = raw_refresh_limits,
3732     .bdrv_io_plug = raw_aio_plug,
3733     .bdrv_io_unplug = raw_aio_unplug,
3734     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3735 
3736     .bdrv_co_truncate    = raw_co_truncate,
3737     .bdrv_getlength      = raw_getlength,
3738     .has_variable_length = true,
3739     .bdrv_get_allocated_file_size
3740                         = raw_get_allocated_file_size,
3741 
3742     /* removable device support */
3743     .bdrv_is_inserted   = cdrom_is_inserted,
3744     .bdrv_eject         = cdrom_eject,
3745     .bdrv_lock_medium   = cdrom_lock_medium,
3746 
3747     /* generic scsi device */
3748     .bdrv_co_ioctl      = hdev_co_ioctl,
3749 };
3750 #endif /* __linux__ */
3751 
3752 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3753 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3754                       Error **errp)
3755 {
3756     BDRVRawState *s = bs->opaque;
3757     int ret;
3758 
3759     s->type = FTYPE_CD;
3760 
3761     ret = raw_open_common(bs, options, flags, 0, true, errp);
3762     if (ret) {
3763         return ret;
3764     }
3765 
3766     /* make sure the door isn't locked at this time */
3767     ioctl(s->fd, CDIOCALLOW);
3768     return 0;
3769 }
3770 
/*
 * Rate @filename as a candidate for the host CD-ROM driver.
 *
 * Returns 100 when the name starts with "/dev/cd" or "/dev/acd",
 * and 0 for anything else.
 */
static int cdrom_probe_device(const char *filename)
{
    if (!strstart(filename, "/dev/cd", NULL) &&
        !strstart(filename, "/dev/acd", NULL)) {
        return 0;
    }

    return 100;
}
3778 
3779 static int cdrom_reopen(BlockDriverState *bs)
3780 {
3781     BDRVRawState *s = bs->opaque;
3782     int fd;
3783 
3784     /*
3785      * Force reread of possibly changed/newly loaded disc,
3786      * FreeBSD seems to not notice sometimes...
3787      */
3788     if (s->fd >= 0)
3789         qemu_close(s->fd);
3790     fd = qemu_open(bs->filename, s->open_flags, 0644);
3791     if (fd < 0) {
3792         s->fd = -1;
3793         return -EIO;
3794     }
3795     s->fd = fd;
3796 
3797     /* make sure the door isn't locked at this time */
3798     ioctl(s->fd, CDIOCALLOW);
3799     return 0;
3800 }
3801 
3802 static bool cdrom_is_inserted(BlockDriverState *bs)
3803 {
3804     return raw_getlength(bs) > 0;
3805 }
3806 
3807 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3808 {
3809     BDRVRawState *s = bs->opaque;
3810 
3811     if (s->fd < 0)
3812         return;
3813 
3814     (void) ioctl(s->fd, CDIOCALLOW);
3815 
3816     if (eject_flag) {
3817         if (ioctl(s->fd, CDIOCEJECT) < 0)
3818             perror("CDIOCEJECT");
3819     } else {
3820         if (ioctl(s->fd, CDIOCCLOSE) < 0)
3821             perror("CDIOCCLOSE");
3822     }
3823 
3824     cdrom_reopen(bs);
3825 }
3826 
3827 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3828 {
3829     BDRVRawState *s = bs->opaque;
3830 
3831     if (s->fd < 0)
3832         return;
3833     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3834         /*
3835          * Note: an error can happen if the distribution automatically
3836          * mounts the CD-ROM
3837          */
3838         /* perror("CDROM_LOCKDOOR"); */
3839     }
3840 }
3841 
3842 static BlockDriver bdrv_host_cdrom = {
3843     .format_name        = "host_cdrom",
3844     .protocol_name      = "host_cdrom",
3845     .instance_size      = sizeof(BDRVRawState),
3846     .bdrv_needs_filename = true,
3847     .bdrv_probe_device	= cdrom_probe_device,
3848     .bdrv_parse_filename = cdrom_parse_filename,
3849     .bdrv_file_open     = cdrom_open,
3850     .bdrv_close         = raw_close,
3851     .bdrv_reopen_prepare = raw_reopen_prepare,
3852     .bdrv_reopen_commit  = raw_reopen_commit,
3853     .bdrv_reopen_abort   = raw_reopen_abort,
3854     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3855     .create_opts         = &bdrv_create_opts_simple,
3856     .mutable_opts       = mutable_opts,
3857 
3858     .bdrv_co_preadv         = raw_co_preadv,
3859     .bdrv_co_pwritev        = raw_co_pwritev,
3860     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3861     .bdrv_refresh_limits = raw_refresh_limits,
3862     .bdrv_io_plug = raw_aio_plug,
3863     .bdrv_io_unplug = raw_aio_unplug,
3864     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3865 
3866     .bdrv_co_truncate    = raw_co_truncate,
3867     .bdrv_getlength      = raw_getlength,
3868     .has_variable_length = true,
3869     .bdrv_get_allocated_file_size
3870                         = raw_get_allocated_file_size,
3871 
3872     /* removable device support */
3873     .bdrv_is_inserted   = cdrom_is_inserted,
3874     .bdrv_eject         = cdrom_eject,
3875     .bdrv_lock_medium   = cdrom_lock_medium,
3876 };
3877 #endif /* __FreeBSD__ */
3878 
3879 static void bdrv_file_init(void)
3880 {
3881     /*
3882      * Register all the drivers.  Note that order is important, the driver
3883      * registered last will get probed first.
3884      */
3885     bdrv_register(&bdrv_file);
3886     bdrv_register(&bdrv_host_device);
3887 #ifdef __linux__
3888     bdrv_register(&bdrv_host_cdrom);
3889 #endif
3890 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3891     bdrv_register(&bdrv_host_cdrom);
3892 #endif
3893 }
3894 
3895 block_init(bdrv_file_init);
3896