xref: /openbmc/qemu/block/file-posix.c (revision eba61056)
1 /*
2  * Block driver for RAW files (posix)
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qapi/error.h"
28 #include "qemu/cutils.h"
29 #include "qemu/error-report.h"
30 #include "block/block_int.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "qemu/units.h"
34 #include "trace.h"
35 #include "block/thread-pool.h"
36 #include "qemu/iov.h"
37 #include "block/raw-aio.h"
38 #include "qapi/qmp/qdict.h"
39 #include "qapi/qmp/qstring.h"
40 
41 #include "scsi/pr-manager.h"
42 #include "scsi/constants.h"
43 
44 #if defined(__APPLE__) && (__MACH__)
45 #include <sys/ioctl.h>
46 #if defined(HAVE_HOST_BLOCK_DEVICE)
47 #include <paths.h>
48 #include <sys/param.h>
49 #include <IOKit/IOKitLib.h>
50 #include <IOKit/IOBSD.h>
51 #include <IOKit/storage/IOMediaBSDClient.h>
52 #include <IOKit/storage/IOMedia.h>
53 #include <IOKit/storage/IOCDMedia.h>
54 //#include <IOKit/storage/IOCDTypes.h>
55 #include <IOKit/storage/IODVDMedia.h>
56 #include <CoreFoundation/CoreFoundation.h>
57 #endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
58 #endif
59 
60 #ifdef __sun__
61 #define _POSIX_PTHREAD_SEMANTICS 1
62 #include <sys/dkio.h>
63 #endif
64 #ifdef __linux__
65 #include <sys/ioctl.h>
66 #include <sys/param.h>
67 #include <sys/syscall.h>
68 #include <sys/vfs.h>
69 #include <linux/cdrom.h>
70 #include <linux/fd.h>
71 #include <linux/fs.h>
72 #include <linux/hdreg.h>
73 #include <linux/magic.h>
74 #include <scsi/sg.h>
75 #ifdef __s390__
76 #include <asm/dasd.h>
77 #endif
78 #ifndef FS_NOCOW_FL
79 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
80 #endif
81 #endif
82 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
83 #include <linux/falloc.h>
84 #endif
85 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
86 #include <sys/disk.h>
87 #include <sys/cdio.h>
88 #endif
89 
90 #ifdef __OpenBSD__
91 #include <sys/ioctl.h>
92 #include <sys/disklabel.h>
93 #include <sys/dkio.h>
94 #endif
95 
96 #ifdef __NetBSD__
97 #include <sys/ioctl.h>
98 #include <sys/disklabel.h>
99 #include <sys/dkio.h>
100 #include <sys/disk.h>
101 #endif
102 
103 #ifdef __DragonFly__
104 #include <sys/ioctl.h>
105 #include <sys/diskslice.h>
106 #endif
107 
108 #ifdef CONFIG_XFS
109 #include <xfs/xfs.h>
110 #endif
111 
112 /* OS X does not have O_DSYNC */
113 #ifndef O_DSYNC
114 #ifdef O_SYNC
115 #define O_DSYNC O_SYNC
116 #elif defined(O_FSYNC)
117 #define O_DSYNC O_FSYNC
118 #endif
119 #endif
120 
121 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
122 #ifndef O_DIRECT
123 #define O_DIRECT O_DSYNC
124 #endif
125 
126 #define FTYPE_FILE   0
127 #define FTYPE_CD     1
128 
129 #define MAX_BLOCKSIZE	4096
130 
131 /* POSIX file locking bytes. Libvirt takes byte 0; we start from higher bytes,
132  * leaving a few more bytes for its future use. */
133 #define RAW_LOCK_PERM_BASE             100
134 #define RAW_LOCK_SHARED_BASE           200
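/*
 * Concretely (see raw_apply_lock_bytes() below): permission bit i is tracked
 * by locking byte RAW_LOCK_PERM_BASE + i, and "permission bit i is not
 * shared" is tracked by locking byte RAW_LOCK_SHARED_BASE + i.
 */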
135 
136 typedef struct BDRVRawState {
137     int fd;
138     bool use_lock;
139     int type;
140     int open_flags;
141     size_t buf_align;
142 
143     /* The current permissions. */
144     uint64_t perm;
145     uint64_t shared_perm;
146 
147     /* The perm bits whose corresponding bytes are already locked in
148      * s->fd. */
149     uint64_t locked_perm;
150     uint64_t locked_shared_perm;
151 
152     int perm_change_fd;
153     int perm_change_flags;
154     BDRVReopenState *reopen_state;
155 
156 #ifdef CONFIG_XFS
157     bool is_xfs:1;
158 #endif
159     bool has_discard:1;
160     bool has_write_zeroes:1;
161     bool discard_zeroes:1;
162     bool use_linux_aio:1;
163     bool use_linux_io_uring:1;
164     int page_cache_inconsistent; /* errno from fdatasync failure */
165     bool has_fallocate;
166     bool needs_alignment;
167     bool drop_cache;
168     bool check_cache_dropped;
169     struct {
170         uint64_t discard_nb_ok;
171         uint64_t discard_nb_failed;
172         uint64_t discard_bytes_ok;
173     } stats;
174 
175     PRManager *pr_mgr;
176 } BDRVRawState;
177 
178 typedef struct BDRVRawReopenState {
179     int open_flags;
180     bool drop_cache;
181     bool check_cache_dropped;
182 } BDRVRawReopenState;
183 
184 static int fd_open(BlockDriverState *bs)
185 {
186     BDRVRawState *s = bs->opaque;
187 
188     /* this is just to ensure s->fd is sane (it's called by I/O ops) */
189     if (s->fd >= 0) {
190         return 0;
191     }
192     return -EIO;
193 }
194 
195 static int64_t raw_getlength(BlockDriverState *bs);
196 
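/*
 * Parameter block for the handle_aiocb_*() worker functions, which receive it
 * as their opaque argument; the union member that is valid depends on
 * aio_type.
 */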
197 typedef struct RawPosixAIOData {
198     BlockDriverState *bs;
199     int aio_type;
200     int aio_fildes;
201 
202     off_t aio_offset;
203     uint64_t aio_nbytes;
204 
205     union {
206         struct {
207             struct iovec *iov;
208             int niov;
209         } io;
210         struct {
211             uint64_t cmd;
212             void *buf;
213         } ioctl;
214         struct {
215             int aio_fd2;
216             off_t aio_offset2;
217         } copy_range;
218         struct {
219             PreallocMode prealloc;
220             Error **errp;
221         } truncate;
222     };
223 } RawPosixAIOData;
224 
225 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
226 static int cdrom_reopen(BlockDriverState *bs);
227 #endif
228 
229 /*
230  * Elide EAGAIN and EACCES details when failing to lock, as this
231  * indicates that the specified file region is already locked by
232  * another process, which is considered a common scenario.
233  */
234 #define raw_lock_error_setg_errno(errp, err, fmt, ...)                  \
235     do {                                                                \
236         if ((err) == EAGAIN || (err) == EACCES) {                       \
237             error_setg((errp), (fmt), ## __VA_ARGS__);                  \
238         } else {                                                        \
239             error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__);     \
240         }                                                               \
241     } while (0)
242 
243 #if defined(__NetBSD__)
244 static int raw_normalize_devicepath(const char **filename, Error **errp)
245 {
246     static char namebuf[PATH_MAX];
247     const char *dp, *fname;
248     struct stat sb;
249 
250     fname = *filename;
251     dp = strrchr(fname, '/');
252     if (lstat(fname, &sb) < 0) {
253         error_setg_file_open(errp, errno, fname);
254         return -errno;
255     }
256 
257     if (!S_ISBLK(sb.st_mode)) {
258         return 0;
259     }
260 
261     if (dp == NULL) {
262         snprintf(namebuf, PATH_MAX, "r%s", fname);
263     } else {
264         snprintf(namebuf, PATH_MAX, "%.*s/r%s",
265             (int)(dp - fname), fname, dp + 1);
266     }
267     *filename = namebuf;
268     warn_report("%s is a block device, using %s", fname, *filename);
269 
270     return 0;
271 }
272 #else
273 static int raw_normalize_devicepath(const char **filename, Error **errp)
274 {
275     return 0;
276 }
277 #endif
278 
279 /*
280  * Get logical block size via ioctl. On success store it in @sector_size_p.
281  */
282 static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
283 {
284     unsigned int sector_size;
285     bool success = false;
286     int i;
287 
288     errno = ENOTSUP;
289     static const unsigned long ioctl_list[] = {
290 #ifdef BLKSSZGET
291         BLKSSZGET,
292 #endif
293 #ifdef DKIOCGETBLOCKSIZE
294         DKIOCGETBLOCKSIZE,
295 #endif
296 #ifdef DIOCGSECTORSIZE
297         DIOCGSECTORSIZE,
298 #endif
299     };
300 
301     /* Try a few ioctls to get the right size */
302     for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
303         if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
304             *sector_size_p = sector_size;
305             success = true;
306         }
307     }
308 
309     return success ? 0 : -errno;
310 }
311 
312 /**
313  * Get physical block size of @fd.
314  * On success, store it in @blk_size and return 0.
315  * On failure, return -errno.
316  */
317 static int probe_physical_blocksize(int fd, unsigned int *blk_size)
318 {
319 #ifdef BLKPBSZGET
320     if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
321         return -errno;
322     }
323     return 0;
324 #else
325     return -ENOTSUP;
326 #endif
327 }
328 
329 /*
330  * Returns true if no alignment restrictions are necessary even for files
331  * opened with O_DIRECT.
332  *
333  * raw_probe_alignment() probes the required alignment and assumes that 1 means
334  * the probing failed, so it falls back to a safe default of 4k. This can be
335  * avoided if we know that byte alignment is okay for the file.
336  */
337 static bool dio_byte_aligned(int fd)
338 {
339 #ifdef __linux__
340     struct statfs buf;
341     int ret;
342 
343     ret = fstatfs(fd, &buf);
344     if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) {
345         return true;
346     }
347 #endif
348     return false;
349 }
350 
351 /* Check if a read is allowed with the given memory buffer and length.
352  *
353  * This function is used to check O_DIRECT memory buffer and request alignment.
354  */
355 static bool raw_is_io_aligned(int fd, void *buf, size_t len)
356 {
357     ssize_t ret = pread(fd, buf, len, 0);
358 
359     if (ret >= 0) {
360         return true;
361     }
362 
363 #ifdef __linux__
364     /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
365      * other errors (e.g. real I/O error), which could happen on a failed
366      * drive, since we only care about probing alignment.
367      */
368     if (errno != EINVAL) {
369         return true;
370     }
371 #endif
372 
373     return false;
374 }
375 
376 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
377 {
378     BDRVRawState *s = bs->opaque;
379     char *buf;
380     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
381     size_t alignments[] = {1, 512, 1024, 2048, 4096};
382 
383     /* For SCSI generic devices the alignment is not really used.
384        With buffered I/O, we don't have any restrictions. */
385     if (bdrv_is_sg(bs) || !s->needs_alignment) {
386         bs->bl.request_alignment = 1;
387         s->buf_align = 1;
388         return;
389     }
390 
391     bs->bl.request_alignment = 0;
392     s->buf_align = 0;
393     /* Let's try to use the logical blocksize for the alignment. */
394     if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
395         bs->bl.request_alignment = 0;
396     }
397 #ifdef CONFIG_XFS
398     if (s->is_xfs) {
399         struct dioattr da;
400         if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
401             bs->bl.request_alignment = da.d_miniosz;
402             /* The kernel returns wrong information for d_mem */
403             /* s->buf_align = da.d_mem; */
404         }
405     }
406 #endif
407 
408     /*
409      * If we could not get the sizes so far, we can only guess them. First try
410      * to detect request alignment, since it is more likely to succeed. Then
411      * try to detect buf_align, which cannot be detected in some cases (e.g.
412  * Gluster). If buf_align cannot be detected, we fall back to the value of
413      * request_alignment.
414      */
415 
416     if (!bs->bl.request_alignment) {
417         int i;
418         size_t align;
419         buf = qemu_memalign(max_align, max_align);
420         for (i = 0; i < ARRAY_SIZE(alignments); i++) {
421             align = alignments[i];
422             if (raw_is_io_aligned(fd, buf, align)) {
423                 /* Fall back to a safe value. */
424                 bs->bl.request_alignment = (align != 1) ? align : max_align;
425                 break;
426             }
427         }
428         qemu_vfree(buf);
429     }
430 
431     if (!s->buf_align) {
432         int i;
433         size_t align;
434         buf = qemu_memalign(max_align, 2 * max_align);
435         for (i = 0; i < ARRAY_SIZE(alignments); i++) {
436             align = alignments[i];
437             if (raw_is_io_aligned(fd, buf + align, max_align)) {
438                 /* Fall back to request_alignment. */
439                 s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
440                 break;
441             }
442         }
443         qemu_vfree(buf);
444     }
445 
446     if (!s->buf_align || !bs->bl.request_alignment) {
447         error_setg(errp, "Could not find working O_DIRECT alignment");
448         error_append_hint(errp, "Try cache.direct=off\n");
449     }
450 }
451 
452 static int check_hdev_writable(int fd)
453 {
454 #if defined(BLKROGET)
455     /* Linux block devices can be configured "read-only" using blockdev(8).
456      * This is independent of device node permissions and therefore open(2)
457      * with O_RDWR succeeds.  Actual writes fail with EPERM.
458      *
459      * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
460      * check for read-only block devices so that Linux block devices behave
461      * properly.
462      */
463     struct stat st;
464     int readonly = 0;
465 
466     if (fstat(fd, &st)) {
467         return -errno;
468     }
469 
470     if (!S_ISBLK(st.st_mode)) {
471         return 0;
472     }
473 
474     if (ioctl(fd, BLKROGET, &readonly) < 0) {
475         return -errno;
476     }
477 
478     if (readonly) {
479         return -EACCES;
480     }
481 #endif /* defined(BLKROGET) */
482     return 0;
483 }
484 
485 static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
486 {
487     bool read_write = false;
488     assert(open_flags != NULL);
489 
490     *open_flags |= O_BINARY;
491     *open_flags &= ~O_ACCMODE;
492 
493     if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
494         read_write = has_writers;
495     } else if (bdrv_flags & BDRV_O_RDWR) {
496         read_write = true;
497     }
498 
499     if (read_write) {
500         *open_flags |= O_RDWR;
501     } else {
502         *open_flags |= O_RDONLY;
503     }
504 
505     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
506      * and O_DIRECT for no caching. */
507     if ((bdrv_flags & BDRV_O_NOCACHE)) {
508         *open_flags |= O_DIRECT;
509     }
510 }
511 
512 static void raw_parse_filename(const char *filename, QDict *options,
513                                Error **errp)
514 {
515     bdrv_parse_filename_strip_prefix(filename, "file:", options);
516 }
517 
518 static QemuOptsList raw_runtime_opts = {
519     .name = "raw",
520     .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
521     .desc = {
522         {
523             .name = "filename",
524             .type = QEMU_OPT_STRING,
525             .help = "File name of the image",
526         },
527         {
528             .name = "aio",
529             .type = QEMU_OPT_STRING,
530             .help = "host AIO implementation (threads, native, io_uring)",
531         },
532         {
533             .name = "locking",
534             .type = QEMU_OPT_STRING,
535             .help = "file locking mode (on/off/auto, default: auto)",
536         },
537         {
538             .name = "pr-manager",
539             .type = QEMU_OPT_STRING,
540             .help = "id of persistent reservation manager object (default: none)",
541         },
542 #if defined(__linux__)
543         {
544             .name = "drop-cache",
545             .type = QEMU_OPT_BOOL,
546             .help = "invalidate page cache during live migration (default: on)",
547         },
548 #endif
549         {
550             .name = "x-check-cache-dropped",
551             .type = QEMU_OPT_BOOL,
552             .help = "check that page cache was dropped on live migration (default: off)"
553         },
554         { /* end of list */ }
555     },
556 };
557 
558 static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
559 
560 static int raw_open_common(BlockDriverState *bs, QDict *options,
561                            int bdrv_flags, int open_flags,
562                            bool device, Error **errp)
563 {
564     BDRVRawState *s = bs->opaque;
565     QemuOpts *opts;
566     Error *local_err = NULL;
567     const char *filename = NULL;
568     const char *str;
569     BlockdevAioOptions aio, aio_default;
570     int fd, ret;
571     struct stat st;
572     OnOffAuto locking;
573 
574     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
575     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
576         ret = -EINVAL;
577         goto fail;
578     }
579 
580     filename = qemu_opt_get(opts, "filename");
581 
582     ret = raw_normalize_devicepath(&filename, errp);
583     if (ret != 0) {
584         goto fail;
585     }
586 
587     if (bdrv_flags & BDRV_O_NATIVE_AIO) {
588         aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
589 #ifdef CONFIG_LINUX_IO_URING
590     } else if (bdrv_flags & BDRV_O_IO_URING) {
591         aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
592 #endif
593     } else {
594         aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
595     }
596 
597     aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
598                           qemu_opt_get(opts, "aio"),
599                           aio_default, &local_err);
600     if (local_err) {
601         error_propagate(errp, local_err);
602         ret = -EINVAL;
603         goto fail;
604     }
605 
606     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
607 #ifdef CONFIG_LINUX_IO_URING
608     s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
609 #endif
610 
611     locking = qapi_enum_parse(&OnOffAuto_lookup,
612                               qemu_opt_get(opts, "locking"),
613                               ON_OFF_AUTO_AUTO, &local_err);
614     if (local_err) {
615         error_propagate(errp, local_err);
616         ret = -EINVAL;
617         goto fail;
618     }
619     switch (locking) {
620     case ON_OFF_AUTO_ON:
621         s->use_lock = true;
622         if (!qemu_has_ofd_lock()) {
623             warn_report("File lock requested but OFD locking syscall is "
624                         "unavailable, falling back to POSIX file locks");
625             error_printf("Due to the implementation, locks can be lost "
626                          "unexpectedly.\n");
627         }
628         break;
629     case ON_OFF_AUTO_OFF:
630         s->use_lock = false;
631         break;
632     case ON_OFF_AUTO_AUTO:
633         s->use_lock = qemu_has_ofd_lock();
634         break;
635     default:
636         abort();
637     }
638 
639     str = qemu_opt_get(opts, "pr-manager");
640     if (str) {
641         s->pr_mgr = pr_manager_lookup(str, &local_err);
642         if (local_err) {
643             error_propagate(errp, local_err);
644             ret = -EINVAL;
645             goto fail;
646         }
647     }
648 
649     s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
650     s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
651                                                false);
652 
653     s->open_flags = open_flags;
654     raw_parse_flags(bdrv_flags, &s->open_flags, false);
655 
656     s->fd = -1;
657     fd = qemu_open(filename, s->open_flags, errp);
658     ret = fd < 0 ? -errno : 0;
659 
660     if (ret < 0) {
661         if (ret == -EROFS) {
662             ret = -EACCES;
663         }
664         goto fail;
665     }
666     s->fd = fd;
667 
668     /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
669     if (s->open_flags & O_RDWR) {
670         ret = check_hdev_writable(s->fd);
671         if (ret < 0) {
672             error_setg_errno(errp, -ret, "The device is not writable");
673             goto fail;
674         }
675     }
676 
677     s->perm = 0;
678     s->shared_perm = BLK_PERM_ALL;
679 
680 #ifdef CONFIG_LINUX_AIO
681      /* Currently Linux does AIO only for files opened with O_DIRECT */
682     if (s->use_linux_aio) {
683         if (!(s->open_flags & O_DIRECT)) {
684             error_setg(errp, "aio=native was specified, but it requires "
685                              "cache.direct=on, which was not specified.");
686             ret = -EINVAL;
687             goto fail;
688         }
689         if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
690             error_prepend(errp, "Unable to use native AIO: ");
691             goto fail;
692         }
693     }
694 #else
695     if (s->use_linux_aio) {
696         error_setg(errp, "aio=native was specified, but is not supported "
697                          "in this build.");
698         ret = -EINVAL;
699         goto fail;
700     }
701 #endif /* !defined(CONFIG_LINUX_AIO) */
702 
703 #ifdef CONFIG_LINUX_IO_URING
704     if (s->use_linux_io_uring) {
705         if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
706             error_prepend(errp, "Unable to use io_uring: ");
707             goto fail;
708         }
709     }
710 #else
711     if (s->use_linux_io_uring) {
712         error_setg(errp, "aio=io_uring was specified, but is not supported "
713                          "in this build.");
714         ret = -EINVAL;
715         goto fail;
716     }
717 #endif /* !defined(CONFIG_LINUX_IO_URING) */
718 
719     s->has_discard = true;
720     s->has_write_zeroes = true;
721     if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
722         s->needs_alignment = true;
723     }
724 
725     if (fstat(s->fd, &st) < 0) {
726         ret = -errno;
727         error_setg_errno(errp, errno, "Could not stat file");
728         goto fail;
729     }
730 
731     if (!device) {
732         if (!S_ISREG(st.st_mode)) {
733             error_setg(errp, "'%s' driver requires '%s' to be a regular file",
734                        bs->drv->format_name, bs->filename);
735             ret = -EINVAL;
736             goto fail;
737         } else {
738             s->discard_zeroes = true;
739             s->has_fallocate = true;
740         }
741     } else {
742         if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
743             error_setg(errp, "'%s' driver requires '%s' to be either "
744                        "a character or block device",
745                        bs->drv->format_name, bs->filename);
746             ret = -EINVAL;
747             goto fail;
748         }
749     }
750 
751     if (S_ISBLK(st.st_mode)) {
752 #ifdef BLKDISCARDZEROES
753         unsigned int arg;
754         if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
755             s->discard_zeroes = true;
756         }
757 #endif
758 #ifdef __linux__
759         /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
760          * not rely on the contents of discarded blocks unless using O_DIRECT.
761          * Same for BLKZEROOUT.
762          */
763         if (!(bs->open_flags & BDRV_O_NOCACHE)) {
764             s->discard_zeroes = false;
765             s->has_write_zeroes = false;
766         }
767 #endif
768     }
769 #ifdef __FreeBSD__
770     if (S_ISCHR(st.st_mode)) {
771         /*
772          * The file is a char device (disk), which on FreeBSD isn't behind
773          * a pager, so force all requests to be aligned. This is needed
774          * so QEMU makes sure all IO operations on the device are aligned
775          * to sector size, or else FreeBSD will reject them with EINVAL.
776          */
777         s->needs_alignment = true;
778     }
779 #endif
780 
781 #ifdef CONFIG_XFS
782     if (platform_test_xfs_fd(s->fd)) {
783         s->is_xfs = true;
784     }
785 #endif
786 
787     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
788     if (S_ISREG(st.st_mode)) {
789         /* When extending regular files, we get zeros from the OS */
790         bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
791     }
792     ret = 0;
793 fail:
794     if (ret < 0 && s->fd != -1) {
795         qemu_close(s->fd);
796     }
797     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
798         unlink(filename);
799     }
800     qemu_opts_del(opts);
801     return ret;
802 }
803 
804 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
805                     Error **errp)
806 {
807     BDRVRawState *s = bs->opaque;
808 
809     s->type = FTYPE_FILE;
810     return raw_open_common(bs, options, flags, 0, false, errp);
811 }
812 
813 typedef enum {
814     RAW_PL_PREPARE,
815     RAW_PL_COMMIT,
816     RAW_PL_ABORT,
817 } RawPermLockOp;
818 
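/* Iterate @i over every permission bit position covered by BLK_PERM_ALL. */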
819 #define PERM_FOREACH(i) \
820     for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
821 
822 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
823  * file; if @unlock == true, also unlock the unneeded bytes.
824  * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
825  */
826 static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
827                                 uint64_t perm_lock_bits,
828                                 uint64_t shared_perm_lock_bits,
829                                 bool unlock, Error **errp)
830 {
831     int ret;
832     int i;
833     uint64_t locked_perm, locked_shared_perm;
834 
835     if (s) {
836         locked_perm = s->locked_perm;
837         locked_shared_perm = s->locked_shared_perm;
838     } else {
839         /*
840          * We don't have the previous bits, just lock/unlock for each of the
841          * requested bits.
842          */
843         if (unlock) {
844             locked_perm = BLK_PERM_ALL;
845             locked_shared_perm = BLK_PERM_ALL;
846         } else {
847             locked_perm = 0;
848             locked_shared_perm = 0;
849         }
850     }
851 
852     PERM_FOREACH(i) {
853         int off = RAW_LOCK_PERM_BASE + i;
854         uint64_t bit = (1ULL << i);
855         if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
856             ret = qemu_lock_fd(fd, off, 1, false);
857             if (ret) {
858                 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
859                                           off);
860                 return ret;
861             } else if (s) {
862                 s->locked_perm |= bit;
863             }
864         } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
865             ret = qemu_unlock_fd(fd, off, 1);
866             if (ret) {
867                 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
868                 return ret;
869             } else if (s) {
870                 s->locked_perm &= ~bit;
871             }
872         }
873     }
874     PERM_FOREACH(i) {
875         int off = RAW_LOCK_SHARED_BASE + i;
876         uint64_t bit = (1ULL << i);
877         if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
878             ret = qemu_lock_fd(fd, off, 1, false);
879             if (ret) {
880                 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
881                                           off);
882                 return ret;
883             } else if (s) {
884                 s->locked_shared_perm |= bit;
885             }
886         } else if (unlock && (locked_shared_perm & bit) &&
887                    !(shared_perm_lock_bits & bit)) {
888             ret = qemu_unlock_fd(fd, off, 1);
889             if (ret) {
890                 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
891                 return ret;
892             } else if (s) {
893                 s->locked_shared_perm &= ~bit;
894             }
895         }
896     }
897     return 0;
898 }
899 
900 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
901 static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
902                                 Error **errp)
903 {
904     int ret;
905     int i;
906 
907     PERM_FOREACH(i) {
908         int off = RAW_LOCK_SHARED_BASE + i;
909         uint64_t p = 1ULL << i;
910         if (perm & p) {
911             ret = qemu_lock_fd_test(fd, off, 1, true);
912             if (ret) {
913                 char *perm_name = bdrv_perm_names(p);
914 
915                 raw_lock_error_setg_errno(errp, -ret,
916                                           "Failed to get \"%s\" lock",
917                                           perm_name);
918                 g_free(perm_name);
919                 return ret;
920             }
921         }
922     }
923     PERM_FOREACH(i) {
924         int off = RAW_LOCK_PERM_BASE + i;
925         uint64_t p = 1ULL << i;
926         if (!(shared_perm & p)) {
927             ret = qemu_lock_fd_test(fd, off, 1, true);
928             if (ret) {
929                 char *perm_name = bdrv_perm_names(p);
930 
931                 raw_lock_error_setg_errno(errp, -ret,
932                                           "Failed to get shared \"%s\" lock",
933                                           perm_name);
934                 g_free(perm_name);
935                 return ret;
936             }
937         }
938     }
939     return 0;
940 }
941 
942 static int raw_handle_perm_lock(BlockDriverState *bs,
943                                 RawPermLockOp op,
944                                 uint64_t new_perm, uint64_t new_shared,
945                                 Error **errp)
946 {
947     BDRVRawState *s = bs->opaque;
948     int ret = 0;
949     Error *local_err = NULL;
950 
951     if (!s->use_lock) {
952         return 0;
953     }
954 
955     if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
956         return 0;
957     }
958 
959     switch (op) {
960     case RAW_PL_PREPARE:
961         if ((s->perm | new_perm) == s->perm &&
962             (s->shared_perm & new_shared) == s->shared_perm)
963         {
964             /*
965              * We are only going to unlock bytes, which should not fail. If it fails
966              * due to some fs-dependent, permission-unrelated reason (which occurs
967              * sometimes on NFS and leads to an abort in bdrv_replace_child), we
968              * can't prevent such errors by any check here. And we ignore them
969              * anyway in ABORT and COMMIT.
970              */
971             return 0;
972         }
973         ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
974                                    ~s->shared_perm | ~new_shared,
975                                    false, errp);
976         if (!ret) {
977             ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
978             if (!ret) {
979                 return 0;
980             }
981             error_append_hint(errp,
982                               "Is another process using the image [%s]?\n",
983                               bs->filename);
984         }
985         /* fall through to unlock bytes. */
986     case RAW_PL_ABORT:
987         raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
988                              true, &local_err);
989         if (local_err) {
990             /* Theoretically the above call only unlocks bytes and it cannot
991              * fail. Something weird happened, report it.
992              */
993             warn_report_err(local_err);
994         }
995         break;
996     case RAW_PL_COMMIT:
997         raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
998                              true, &local_err);
999         if (local_err) {
1000             /* Theoretically the above call only unlocks bytes and it cannot
1001              * fail. Something weird happened, report it.
1002              */
1003             warn_report_err(local_err);
1004         }
1005         break;
1006     }
1007     return ret;
1008 }
1009 
1010 static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
1011                                  int *open_flags, uint64_t perm, bool force_dup,
1012                                  Error **errp)
1013 {
1014     BDRVRawState *s = bs->opaque;
1015     int fd = -1;
1016     int ret;
1017     bool has_writers = perm &
1018         (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
1019     int fcntl_flags = O_APPEND | O_NONBLOCK;
1020 #ifdef O_NOATIME
1021     fcntl_flags |= O_NOATIME;
1022 #endif
1023 
1024     *open_flags = 0;
1025     if (s->type == FTYPE_CD) {
1026         *open_flags |= O_NONBLOCK;
1027     }
1028 
1029     raw_parse_flags(flags, open_flags, has_writers);
1030 
1031 #ifdef O_ASYNC
1032     /* Not all operating systems have O_ASYNC, and those that don't
1033      * will not let us track the state into rs->open_flags (typically
1034      * you achieve the same effect with an ioctl, for example I_SETSIG
1035      * on Solaris). But we do not use O_ASYNC, so that's fine.
1036      */
1037     assert((s->open_flags & O_ASYNC) == 0);
1038 #endif
1039 
1040     if (!force_dup && *open_flags == s->open_flags) {
1041         /* We're lucky, the existing fd is fine */
1042         return s->fd;
1043     }
1044 
1045     if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
1046         /* dup the original fd */
1047         fd = qemu_dup(s->fd);
1048         if (fd >= 0) {
1049             ret = fcntl_setfl(fd, *open_flags);
1050             if (ret) {
1051                 qemu_close(fd);
1052                 fd = -1;
1053             }
1054         }
1055     }
1056 
1057     /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
1058     if (fd == -1) {
1059         const char *normalized_filename = bs->filename;
1060         ret = raw_normalize_devicepath(&normalized_filename, errp);
1061         if (ret >= 0) {
1062             fd = qemu_open(normalized_filename, *open_flags, errp);
1063             if (fd == -1) {
1064                 return -1;
1065             }
1066         }
1067     }
1068 
1069     if (fd != -1 && (*open_flags & O_RDWR)) {
1070         ret = check_hdev_writable(fd);
1071         if (ret < 0) {
1072             qemu_close(fd);
1073             error_setg_errno(errp, -ret, "The device is not writable");
1074             return -1;
1075         }
1076     }
1077 
1078     return fd;
1079 }
1080 
1081 static int raw_reopen_prepare(BDRVReopenState *state,
1082                               BlockReopenQueue *queue, Error **errp)
1083 {
1084     BDRVRawState *s;
1085     BDRVRawReopenState *rs;
1086     QemuOpts *opts;
1087     int ret;
1088 
1089     assert(state != NULL);
1090     assert(state->bs != NULL);
1091 
1092     s = state->bs->opaque;
1093 
1094     state->opaque = g_new0(BDRVRawReopenState, 1);
1095     rs = state->opaque;
1096 
1097     /* Handle options changes */
1098     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
1099     if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
1100         ret = -EINVAL;
1101         goto out;
1102     }
1103 
1104     rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
1105     rs->check_cache_dropped =
1106         qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
1107 
1108     /* This driver's reopen function doesn't currently allow changing
1109      * other options, so let's put them back in the original QDict and
1110      * bdrv_reopen_prepare() will detect changes and complain. */
1111     qemu_opts_to_qdict(opts, state->options);
1112 
1113     /*
1114      * As part of reopen prepare we also want to create a new fd via
1115      * raw_reconfigure_getfd(). But it needs the updated "perm", whereas in
1116      * bdrv_reopen_multiple() the .bdrv_reopen_prepare() callback is called prior
1117      * to the permission update. Happily, the permission update is always a part
1118      * (a separate stage) of bdrv_reopen_multiple(), so we can rely on this fact
1119      * and reconfigure the fd in raw_check_perm().
1120      */
1121 
1122     s->reopen_state = state;
1123     ret = 0;
1124 
1125 out:
1126     qemu_opts_del(opts);
1127     return ret;
1128 }
1129 
1130 static void raw_reopen_commit(BDRVReopenState *state)
1131 {
1132     BDRVRawReopenState *rs = state->opaque;
1133     BDRVRawState *s = state->bs->opaque;
1134 
1135     s->drop_cache = rs->drop_cache;
1136     s->check_cache_dropped = rs->check_cache_dropped;
1137     s->open_flags = rs->open_flags;
1138     g_free(state->opaque);
1139     state->opaque = NULL;
1140 
1141     assert(s->reopen_state == state);
1142     s->reopen_state = NULL;
1143 }
1144 
1145 
1146 static void raw_reopen_abort(BDRVReopenState *state)
1147 {
1148     BDRVRawReopenState *rs = state->opaque;
1149     BDRVRawState *s = state->bs->opaque;
1150 
1151      /* nothing to do if NULL, we didn't get far enough */
1152     if (rs == NULL) {
1153         return;
1154     }
1155 
1156     g_free(state->opaque);
1157     state->opaque = NULL;
1158 
1159     assert(s->reopen_state == state);
1160     s->reopen_state = NULL;
1161 }
1162 
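/*
 * Query the maximum transfer size accepted by the host device. Note that
 * BLKSECTGET reports 512-byte sectors for block devices but plain bytes for
 * SG character devices, hence the two branches below.
 */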
1163 static int hdev_get_max_hw_transfer(int fd, struct stat *st)
1164 {
1165 #ifdef BLKSECTGET
1166     if (S_ISBLK(st->st_mode)) {
1167         unsigned short max_sectors = 0;
1168         if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
1169             return max_sectors * 512;
1170         }
1171     } else {
1172         int max_bytes = 0;
1173         if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
1174             return max_bytes;
1175         }
1176     }
1177     return -errno;
1178 #else
1179     return -ENOSYS;
1180 #endif
1181 }
1182 
1183 static int hdev_get_max_segments(int fd, struct stat *st)
1184 {
1185 #ifdef CONFIG_LINUX
1186     char buf[32];
1187     const char *end;
1188     char *sysfspath = NULL;
1189     int ret;
1190     int sysfd = -1;
1191     long max_segments;
1192 
1193     if (S_ISCHR(st->st_mode)) {
1194         if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
1195             return ret;
1196         }
1197         return -ENOTSUP;
1198     }
1199 
1200     if (!S_ISBLK(st->st_mode)) {
1201         return -ENOTSUP;
1202     }
1203 
1204     sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
1205                                 major(st->st_rdev), minor(st->st_rdev));
1206     sysfd = open(sysfspath, O_RDONLY);
1207     if (sysfd == -1) {
1208         ret = -errno;
1209         goto out;
1210     }
1211     do {
1212         ret = read(sysfd, buf, sizeof(buf) - 1);
1213     } while (ret == -1 && errno == EINTR);
1214     if (ret < 0) {
1215         ret = -errno;
1216         goto out;
1217     } else if (ret == 0) {
1218         ret = -EIO;
1219         goto out;
1220     }
1221     buf[ret] = 0;
1222     /* The file ends with '\n'; pass 'end' to accept that. */
1223     ret = qemu_strtol(buf, &end, 10, &max_segments);
1224     if (ret == 0 && end && *end == '\n') {
1225         ret = max_segments;
1226     }
1227 
1228 out:
1229     if (sysfd != -1) {
1230         close(sysfd);
1231     }
1232     g_free(sysfspath);
1233     return ret;
1234 #else
1235     return -ENOTSUP;
1236 #endif
1237 }
1238 
1239 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1240 {
1241     BDRVRawState *s = bs->opaque;
1242     struct stat st;
1243 
1244     raw_probe_alignment(bs, s->fd, errp);
1245     bs->bl.min_mem_alignment = s->buf_align;
1246     bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
1247 
1248     /*
1249      * Maximum transfers are best effort, so it is okay to ignore any
1250  * errors.  That said, based on the man page, errors in fstat would be
1251      * very much unexpected; the only possible case seems to be ENOMEM.
1252      */
1253     if (fstat(s->fd, &st)) {
1254         return;
1255     }
1256 
1257     if (bs->sg || S_ISBLK(st.st_mode)) {
1258         int ret = hdev_get_max_hw_transfer(s->fd, &st);
1259 
1260         if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1261             bs->bl.max_hw_transfer = ret;
1262         }
1263 
1264         ret = hdev_get_max_segments(s->fd, &st);
1265         if (ret > 0) {
1266             bs->bl.max_iov = ret;
1267         }
1268     }
1269 }
1270 
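/*
 * Returns 0 if @fd refers to an s390 DASD device (i.e. the BIODASDINFO2
 * ioctl succeeds) and a negative value otherwise.
 */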
1271 static int check_for_dasd(int fd)
1272 {
1273 #ifdef BIODASDINFO2
1274     struct dasd_information2_t info = {0};
1275 
1276     return ioctl(fd, BIODASDINFO2, &info);
1277 #else
1278     return -1;
1279 #endif
1280 }
1281 
1282 /**
1283  * Try to get @bs's logical and physical block size.
1284  * On success, store them in @bsz and return zero.
1285  * On failure, return negative errno.
1286  */
1287 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1288 {
1289     BDRVRawState *s = bs->opaque;
1290     int ret;
1291 
1292     /* If DASD, get blocksizes */
1293     if (check_for_dasd(s->fd) < 0) {
1294         return -ENOTSUP;
1295     }
1296     ret = probe_logical_blocksize(s->fd, &bsz->log);
1297     if (ret < 0) {
1298         return ret;
1299     }
1300     return probe_physical_blocksize(s->fd, &bsz->phys);
1301 }
1302 
1303 /**
1304  * Try to get @bs's geometry: cyls, heads, sectors.
1305  * On success, store them in @geo and return 0.
1306  * On failure return -errno.
1307  * (Allows block driver to assign default geometry values that guest sees)
1308  */
1309 #ifdef __linux__
1310 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1311 {
1312     BDRVRawState *s = bs->opaque;
1313     struct hd_geometry ioctl_geo = {0};
1314 
1315     /* If DASD, get its geometry */
1316     if (check_for_dasd(s->fd) < 0) {
1317         return -ENOTSUP;
1318     }
1319     if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1320         return -errno;
1321     }
1322     /* HDIO_GETGEO may return success even though geo contains zeros
1323        (e.g. certain multipath setups) */
1324     if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1325         return -ENOTSUP;
1326     }
1327     /* Do not return a geometry for a partition */
1328     if (ioctl_geo.start != 0) {
1329         return -ENOTSUP;
1330     }
1331     geo->heads = ioctl_geo.heads;
1332     geo->sectors = ioctl_geo.sectors;
1333     geo->cylinders = ioctl_geo.cylinders;
1334 
1335     return 0;
1336 }
1337 #else /* __linux__ */
1338 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1339 {
1340     return -ENOTSUP;
1341 }
1342 #endif
1343 
1344 #if defined(__linux__)
1345 static int handle_aiocb_ioctl(void *opaque)
1346 {
1347     RawPosixAIOData *aiocb = opaque;
1348     int ret;
1349 
1350     do {
1351         ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
1352     } while (ret == -1 && errno == EINTR);
1353     if (ret == -1) {
1354         return -errno;
1355     }
1356 
1357     return 0;
1358 }
1359 #endif /* linux */
1360 
1361 static int handle_aiocb_flush(void *opaque)
1362 {
1363     RawPosixAIOData *aiocb = opaque;
1364     BDRVRawState *s = aiocb->bs->opaque;
1365     int ret;
1366 
1367     if (s->page_cache_inconsistent) {
1368         return -s->page_cache_inconsistent;
1369     }
1370 
1371     ret = qemu_fdatasync(aiocb->aio_fildes);
1372     if (ret == -1) {
1373         trace_file_flush_fdatasync_failed(errno);
1374 
1375         /* There is no clear definition of the semantics of a failing fsync(),
1376          * so we may have to assume the worst. The sad truth is that this
1377          * assumption is correct for Linux. Some pages are now probably marked
1378          * clean in the page cache even though they are inconsistent with the
1379          * on-disk contents. The next fdatasync() call would succeed, but no
1380          * further writeback attempt will be made. We can't get back to a state
1381          * in which we know what is on disk (we would have to rewrite
1382          * everything that was touched since the last fdatasync() at least), so
1383          * make bdrv_flush() fail permanently. Given that the behaviour isn't
1384          * really defined, I have little hope that other OSes are doing better.
1385          *
1386          * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1387          * cache. */
1388         if ((s->open_flags & O_DIRECT) == 0) {
1389             s->page_cache_inconsistent = errno;
1390         }
1391         return -errno;
1392     }
1393     return 0;
1394 }
1395 
1396 #ifdef CONFIG_PREADV
1397 
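/*
 * preadv()/pwritev() availability is detected at build time; preadv_present
 * is additionally cleared at runtime if a vectored call returns -ENOSYS or
 * transfers fewer bytes than requested (see handle_aiocb_rw()).
 */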
1398 static bool preadv_present = true;
1399 
1400 static ssize_t
1401 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1402 {
1403     return preadv(fd, iov, nr_iov, offset);
1404 }
1405 
1406 static ssize_t
1407 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1408 {
1409     return pwritev(fd, iov, nr_iov, offset);
1410 }
1411 
1412 #else
1413 
1414 static bool preadv_present = false;
1415 
1416 static ssize_t
1417 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1418 {
1419     return -ENOSYS;
1420 }
1421 
1422 static ssize_t
1423 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1424 {
1425     return -ENOSYS;
1426 }
1427 
1428 #endif
1429 
1430 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1431 {
1432     ssize_t len;
1433 
1434     do {
1435         if (aiocb->aio_type & QEMU_AIO_WRITE)
1436             len = qemu_pwritev(aiocb->aio_fildes,
1437                                aiocb->io.iov,
1438                                aiocb->io.niov,
1439                                aiocb->aio_offset);
1440          else
1441             len = qemu_preadv(aiocb->aio_fildes,
1442                               aiocb->io.iov,
1443                               aiocb->io.niov,
1444                               aiocb->aio_offset);
1445     } while (len == -1 && errno == EINTR);
1446 
1447     if (len == -1) {
1448         return -errno;
1449     }
1450     return len;
1451 }
1452 
1453 /*
1454  * Reads/writes the data to/from a given linear buffer.
1455  *
1456  * Returns the number of bytes handled or -errno in case of an error. Short
1457  * reads are only returned if the end of the file is reached.
1458  */
1459 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1460 {
1461     ssize_t offset = 0;
1462     ssize_t len;
1463 
1464     while (offset < aiocb->aio_nbytes) {
1465         if (aiocb->aio_type & QEMU_AIO_WRITE) {
1466             len = pwrite(aiocb->aio_fildes,
1467                          (const char *)buf + offset,
1468                          aiocb->aio_nbytes - offset,
1469                          aiocb->aio_offset + offset);
1470         } else {
1471             len = pread(aiocb->aio_fildes,
1472                         buf + offset,
1473                         aiocb->aio_nbytes - offset,
1474                         aiocb->aio_offset + offset);
1475         }
1476         if (len == -1 && errno == EINTR) {
1477             continue;
1478         } else if (len == -1 && errno == EINVAL &&
1479                    (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1480                    !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1481                    offset > 0) {
1482             /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1483              * after a short read.  Assume that O_DIRECT short reads only occur
1484              * at EOF.  Therefore this is a short read, not an I/O error.
1485              */
1486             break;
1487         } else if (len == -1) {
1488             offset = -errno;
1489             break;
1490         } else if (len == 0) {
1491             break;
1492         }
1493         offset += len;
1494     }
1495 
1496     return offset;
1497 }
1498 
1499 static int handle_aiocb_rw(void *opaque)
1500 {
1501     RawPosixAIOData *aiocb = opaque;
1502     ssize_t nbytes;
1503     char *buf;
1504 
1505     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1506         /*
1507          * If there is just a single buffer, and it is properly aligned
1508          * we can just use plain pread/pwrite without any problems.
1509          */
1510         if (aiocb->io.niov == 1) {
1511             nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1512             goto out;
1513         }
1514         /*
1515          * We have more than one iovec, and all are properly aligned.
1516          *
1517          * Try preadv/pwritev first and fall back to linearizing the
1518          * buffer if it's not supported.
1519          */
1520         if (preadv_present) {
1521             nbytes = handle_aiocb_rw_vector(aiocb);
1522             if (nbytes == aiocb->aio_nbytes ||
1523                 (nbytes < 0 && nbytes != -ENOSYS)) {
1524                 goto out;
1525             }
1526             preadv_present = false;
1527         }
1528 
1529         /*
1530          * XXX(hch): short read/write.  No easy way to handle the remainder
1531          * using these interfaces.  For now retry using plain
1532          * pread/pwrite?
1533          */
1534     }
1535 
1536     /*
1537      * Ok, we have to do it the hard way, copy all segments into
1538      * a single aligned buffer.
1539      */
1540     buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1541     if (buf == NULL) {
1542         nbytes = -ENOMEM;
1543         goto out;
1544     }
1545 
1546     if (aiocb->aio_type & QEMU_AIO_WRITE) {
1547         char *p = buf;
1548         int i;
1549 
1550         for (i = 0; i < aiocb->io.niov; ++i) {
1551             memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1552             p += aiocb->io.iov[i].iov_len;
1553         }
1554         assert(p - buf == aiocb->aio_nbytes);
1555     }
1556 
1557     nbytes = handle_aiocb_rw_linear(aiocb, buf);
1558     if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1559         char *p = buf;
1560         size_t count = aiocb->aio_nbytes, copy;
1561         int i;
1562 
1563         for (i = 0; i < aiocb->io.niov && count; ++i) {
1564             copy = count;
1565             if (copy > aiocb->io.iov[i].iov_len) {
1566                 copy = aiocb->io.iov[i].iov_len;
1567             }
1568             memcpy(aiocb->io.iov[i].iov_base, p, copy);
1569             assert(count >= copy);
1570             p     += copy;
1571             count -= copy;
1572         }
1573         assert(count == 0);
1574     }
1575     qemu_vfree(buf);
1576 
1577 out:
1578     if (nbytes == aiocb->aio_nbytes) {
1579         return 0;
1580     } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1581         if (aiocb->aio_type & QEMU_AIO_WRITE) {
1582             return -EINVAL;
1583         } else {
1584             iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1585                       0, aiocb->aio_nbytes - nbytes);
1586             return 0;
1587         }
1588     } else {
1589         assert(nbytes < 0);
1590         return nbytes;
1591     }
1592 }
1593 
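/*
 * Collapse the various "operation not supported" errno values into -ENOTSUP
 * so that callers only have a single value to check for.
 */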
1594 static int translate_err(int err)
1595 {
1596     if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
1597         err == -ENOTTY) {
1598         err = -ENOTSUP;
1599     }
1600     return err;
1601 }
1602 
1603 #ifdef CONFIG_FALLOCATE
1604 static int do_fallocate(int fd, int mode, off_t offset, off_t len)
1605 {
1606     do {
1607         if (fallocate(fd, mode, offset, len) == 0) {
1608             return 0;
1609         }
1610     } while (errno == EINTR);
1611     return translate_err(-errno);
1612 }
1613 #endif
1614 
1615 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1616 {
1617     int ret = -ENOTSUP;
1618     BDRVRawState *s = aiocb->bs->opaque;
1619 
1620     if (!s->has_write_zeroes) {
1621         return -ENOTSUP;
1622     }
1623 
1624 #ifdef BLKZEROOUT
1625     /* The BLKZEROOUT implementation in the kernel doesn't set
1626      * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1627      * fallbacks. */
1628     if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1629         do {
1630             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1631             if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1632                 return 0;
1633             }
1634         } while (errno == EINTR);
1635 
1636         ret = translate_err(-errno);
1637         if (ret == -ENOTSUP) {
1638             s->has_write_zeroes = false;
1639         }
1640     }
1641 #endif
1642 
1643     return ret;
1644 }
1645 
1646 static int handle_aiocb_write_zeroes(void *opaque)
1647 {
1648     RawPosixAIOData *aiocb = opaque;
1649 #ifdef CONFIG_FALLOCATE
1650     BDRVRawState *s = aiocb->bs->opaque;
1651     int64_t len;
1652 #endif
1653 
1654     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1655         return handle_aiocb_write_zeroes_block(aiocb);
1656     }
1657 
1658 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
1659     if (s->has_write_zeroes) {
1660         int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1661                                aiocb->aio_offset, aiocb->aio_nbytes);
1662         if (ret == -ENOTSUP) {
1663             s->has_write_zeroes = false;
1664         } else if (ret == 0 || ret != -EINVAL) {
1665             return ret;
1666         }
1667         /*
1668          * Note: Some file systems do not like unaligned byte ranges, and
1669          * return EINVAL in such a case, though they should not do it according
1670          * to the man-page of fallocate(). Thus we simply ignore this return
1671          * value and try the other fallbacks instead.
1672          */
1673     }
1674 #endif
1675 
1676 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1677     if (s->has_discard && s->has_fallocate) {
1678         int ret = do_fallocate(s->fd,
1679                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1680                                aiocb->aio_offset, aiocb->aio_nbytes);
1681         if (ret == 0) {
1682             ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1683             if (ret == 0 || ret != -ENOTSUP) {
1684                 return ret;
1685             }
1686             s->has_fallocate = false;
1687         } else if (ret == -EINVAL) {
1688             /*
1689              * Some file systems like older versions of GPFS do not like un-
1690              * aligned byte ranges, and return EINVAL in such a case, though
1691              * they should not do it according to the man-page of fallocate().
1692              * Warn about the bad filesystem and try the final fallback instead.
1693              */
1694             warn_report_once("Your file system is misbehaving: "
1695                              "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
1696                              "Please report this bug to your file system "
1697                              "vendor.");
1698         } else if (ret != -ENOTSUP) {
1699             return ret;
1700         } else {
1701             s->has_discard = false;
1702         }
1703     }
1704 #endif
1705 
1706 #ifdef CONFIG_FALLOCATE
1707     /* Last resort: we are trying to extend the file with zeroed data. This
1708      * can be done via fallocate(fd, 0) */
1709     len = bdrv_getlength(aiocb->bs);
1710     if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1711         int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1712         if (ret == 0 || ret != -ENOTSUP) {
1713             return ret;
1714         }
1715         s->has_fallocate = false;
1716     }
1717 #endif
1718 
1719     return -ENOTSUP;
1720 }
1721 
1722 static int handle_aiocb_write_zeroes_unmap(void *opaque)
1723 {
1724     RawPosixAIOData *aiocb = opaque;
1725     BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1726 
1727     /* First try to write zeros and unmap at the same time */
1728 
1729 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1730     int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1731                            aiocb->aio_offset, aiocb->aio_nbytes);
1732     switch (ret) {
1733     case -ENOTSUP:
1734     case -EINVAL:
1735     case -EBUSY:
1736         break;
1737     default:
1738         return ret;
1739     }
1740 #endif
1741 
1742     /* If we couldn't unmap while guaranteeing that the area reads as
1743      * all-zero afterwards, just write zeroes without unmapping */
1744     return handle_aiocb_write_zeroes(aiocb);
1745 }
1746 
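/*
 * Fallback for libc versions that lack a copy_file_range() wrapper:
 * issue the raw syscall if the kernel headers define
 * __NR_copy_file_range, otherwise report ENOSYS.
 */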
1747 #ifndef HAVE_COPY_FILE_RANGE
1748 static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
1749                              off_t *out_off, size_t len, unsigned int flags)
1750 {
1751 #ifdef __NR_copy_file_range
1752     return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
1753                    out_off, len, flags);
1754 #else
1755     errno = ENOSYS;
1756     return -1;
1757 #endif
1758 }
1759 #endif
1760 
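/*
 * Offloaded copy between two file descriptors.  copy_file_range() may
 * copy fewer bytes than requested, so loop until the whole range has
 * been copied.  A return value of 0 means no progress was made (e.g.
 * when reading beyond EOF), which is reported as -ENOSPC so that the
 * caller falls back to buffered I/O; ENOSYS is mapped to -ENOTSUP so
 * that unsupported hosts fall back as well.
 */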
1761 static int handle_aiocb_copy_range(void *opaque)
1762 {
1763     RawPosixAIOData *aiocb = opaque;
1764     uint64_t bytes = aiocb->aio_nbytes;
1765     off_t in_off = aiocb->aio_offset;
1766     off_t out_off = aiocb->copy_range.aio_offset2;
1767 
1768     while (bytes) {
1769         ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1770                                       aiocb->copy_range.aio_fd2, &out_off,
1771                                       bytes, 0);
1772         trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1773                                    aiocb->copy_range.aio_fd2, out_off, bytes,
1774                                    0, ret);
1775         if (ret == 0) {
1776             /* No progress (e.g. when beyond EOF), let the caller fall back to
1777              * buffered I/O. */
1778             return -ENOSPC;
1779         }
1780         if (ret < 0) {
1781             switch (errno) {
1782             case ENOSYS:
1783                 return -ENOTSUP;
1784             case EINTR:
1785                 continue;
1786             default:
1787                 return -errno;
1788             }
1789         }
1790         bytes -= ret;
1791     }
1792     return 0;
1793 }
1794 
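/*
 * Discard a byte range.  Block devices use the BLKDISCARD ioctl while
 * regular files punch a hole with fallocate().  A result that
 * translates to -ENOTSUP disables further discard attempts on this
 * file.
 */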
1795 static int handle_aiocb_discard(void *opaque)
1796 {
1797     RawPosixAIOData *aiocb = opaque;
1798     int ret = -EOPNOTSUPP;
1799     BDRVRawState *s = aiocb->bs->opaque;
1800 
1801     if (!s->has_discard) {
1802         return -ENOTSUP;
1803     }
1804 
1805     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1806 #ifdef BLKDISCARD
1807         do {
1808             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1809             if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1810                 return 0;
1811             }
1812         } while (errno == EINTR);
1813 
1814         ret = -errno;
1815 #endif
1816     } else {
1817 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1818         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1819                            aiocb->aio_offset, aiocb->aio_nbytes);
1820 #endif
1821     }
1822 
1823     ret = translate_err(ret);
1824     if (ret == -ENOTSUP) {
1825         s->has_discard = false;
1826     }
1827     return ret;
1828 }
1829 
1830 /*
1831  * Help alignment probing by allocating the first block.
1832  *
1833  * When reading with direct I/O from an unallocated area on Gluster backed by
1834  * XFS, the read succeeds regardless of request length. In this case we fall
1835  * back to a safe alignment which is not optimal. Allocating the first block
1836  * avoids this fallback.
1837  *
1838  * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
1839  * request alignment, so we use safe values.
1840  *
1841  * Returns: 0 on success, -errno on failure. Since this is an optimization,
1842  * caller may ignore failures.
1843  */
1844 static int allocate_first_block(int fd, size_t max_size)
1845 {
1846     size_t write_size = (max_size < MAX_BLOCKSIZE)
1847         ? BDRV_SECTOR_SIZE
1848         : MAX_BLOCKSIZE;
1849     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
1850     void *buf;
1851     ssize_t n;
1852     int ret;
1853 
1854     buf = qemu_memalign(max_align, write_size);
1855     memset(buf, 0, write_size);
1856 
1857     do {
1858         n = pwrite(fd, buf, write_size, 0);
1859     } while (n == -1 && errno == EINTR);
1860 
1861     ret = (n == -1) ? -errno : 0;
1862 
1863     qemu_vfree(buf);
1864     return ret;
1865 }
1866 
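/*
 * Resize a regular file according to the requested preallocation mode:
 * PREALLOC_MODE_OFF just ftruncates, PREALLOC_MODE_FALLOC uses
 * posix_fallocate() to allocate the new area, and PREALLOC_MODE_FULL
 * writes explicit zeroes up to the new size.  If preallocation fails,
 * the old file length is restored.
 */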
1867 static int handle_aiocb_truncate(void *opaque)
1868 {
1869     RawPosixAIOData *aiocb = opaque;
1870     int result = 0;
1871     int64_t current_length = 0;
1872     char *buf = NULL;
1873     struct stat st;
1874     int fd = aiocb->aio_fildes;
1875     int64_t offset = aiocb->aio_offset;
1876     PreallocMode prealloc = aiocb->truncate.prealloc;
1877     Error **errp = aiocb->truncate.errp;
1878 
1879     if (fstat(fd, &st) < 0) {
1880         result = -errno;
1881         error_setg_errno(errp, -result, "Could not stat file");
1882         return result;
1883     }
1884 
1885     current_length = st.st_size;
1886     if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
1887         error_setg(errp, "Cannot use preallocation for shrinking files");
1888         return -ENOTSUP;
1889     }
1890 
1891     switch (prealloc) {
1892 #ifdef CONFIG_POSIX_FALLOCATE
1893     case PREALLOC_MODE_FALLOC:
1894         /*
1895          * Truncating before posix_fallocate() makes it about twice as slow on
1896          * file systems without fallocate() support: the fallback then checks if
1897          * each block is allocated before allocating it, so don't do that here.
1898          */
1899         if (offset != current_length) {
1900             result = -posix_fallocate(fd, current_length,
1901                                       offset - current_length);
1902             if (result != 0) {
1903                 /* posix_fallocate() doesn't set errno. */
1904                 error_setg_errno(errp, -result,
1905                                  "Could not preallocate new data");
1906             } else if (current_length == 0) {
1907                 /*
1908                  * posix_fallocate() uses fallocate() if the filesystem
1909                  * supports it, or falls back to manually writing zeroes. If
1910                  * fallocate() was used, unaligned reads from the fallocated
1911                  * area in raw_probe_alignment() will succeed, hence we need to
1912                  * allocate the first block.
1913                  *
1914                  * Optimize future alignment probing; ignore failures.
1915                  */
1916                 allocate_first_block(fd, offset);
1917             }
1918         } else {
1919             result = 0;
1920         }
1921         goto out;
1922 #endif
1923     case PREALLOC_MODE_FULL:
1924     {
1925         int64_t num = 0, left = offset - current_length;
1926         off_t seek_result;
1927 
1928         /*
1929          * Knowing the final size from the beginning could allow the file
1930          * system driver to do fewer allocations and possibly avoid
1931          * fragmentation of the file.
1932          */
1933         if (ftruncate(fd, offset) != 0) {
1934             result = -errno;
1935             error_setg_errno(errp, -result, "Could not resize file");
1936             goto out;
1937         }
1938 
1939         buf = g_malloc0(65536);
1940 
1941         seek_result = lseek(fd, current_length, SEEK_SET);
1942         if (seek_result < 0) {
1943             result = -errno;
1944             error_setg_errno(errp, -result,
1945                              "Failed to seek to the old end of file");
1946             goto out;
1947         }
1948 
1949         while (left > 0) {
1950             num = MIN(left, 65536);
1951             result = write(fd, buf, num);
1952             if (result < 0) {
1953                 if (errno == EINTR) {
1954                     continue;
1955                 }
1956                 result = -errno;
1957                 error_setg_errno(errp, -result,
1958                                  "Could not write zeros for preallocation");
1959                 goto out;
1960             }
1961             left -= result;
1962         }
1963         if (result >= 0) {
1964             result = fsync(fd);
1965             if (result < 0) {
1966                 result = -errno;
1967                 error_setg_errno(errp, -result,
1968                                  "Could not flush file to disk");
1969                 goto out;
1970             }
1971         }
1972         goto out;
1973     }
1974     case PREALLOC_MODE_OFF:
1975         if (ftruncate(fd, offset) != 0) {
1976             result = -errno;
1977             error_setg_errno(errp, -result, "Could not resize file");
1978         } else if (current_length == 0 && offset > current_length) {
1979             /* Optimize future alignment probing; ignore failures. */
1980             allocate_first_block(fd, offset);
1981         }
1982         return result;
1983     default:
1984         result = -ENOTSUP;
1985         error_setg(errp, "Unsupported preallocation mode: %s",
1986                    PreallocMode_str(prealloc));
1987         return result;
1988     }
1989 
1990 out:
1991     if (result < 0) {
1992         if (ftruncate(fd, current_length) < 0) {
1993             error_report("Failed to restore old file length: %s",
1994                          strerror(errno));
1995         }
1996     }
1997 
1998     g_free(buf);
1999     return result;
2000 }
2001 
2002 static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
2003                                                ThreadPoolFunc func, void *arg)
2004 {
2005     /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
2006     ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
2007     return thread_pool_submit_co(pool, func, arg);
2008 }
2009 
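/*
 * Submit a read or write request.  Requests that satisfy the alignment
 * requirements (or have none) are passed to io_uring or Linux AIO when
 * those are enabled; everything else, in particular misaligned requests
 * under O_DIRECT, goes through the thread pool, which emulates
 * misaligned access for us.
 */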
2010 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
2011                                    uint64_t bytes, QEMUIOVector *qiov, int type)
2012 {
2013     BDRVRawState *s = bs->opaque;
2014     RawPosixAIOData acb;
2015 
2016     if (fd_open(bs) < 0)
2017         return -EIO;
2018 
2019     /*
2020      * When using O_DIRECT, the request must be aligned to be able to use
2021      * either the libaio or the io_uring interface. If not, fall back to the
2022      * regular thread pool read/write code, which emulates this for us if we
2023      * set QEMU_AIO_MISALIGNED.
2024      */
2025     if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
2026         type |= QEMU_AIO_MISALIGNED;
2027 #ifdef CONFIG_LINUX_IO_URING
2028     } else if (s->use_linux_io_uring) {
2029         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2030         assert(qiov->size == bytes);
2031         return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
2032 #endif
2033 #ifdef CONFIG_LINUX_AIO
2034     } else if (s->use_linux_aio) {
2035         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2036         assert(qiov->size == bytes);
2037         return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
2038 #endif
2039     }
2040 
2041     acb = (RawPosixAIOData) {
2042         .bs             = bs,
2043         .aio_fildes     = s->fd,
2044         .aio_type       = type,
2045         .aio_offset     = offset,
2046         .aio_nbytes     = bytes,
2047         .io             = {
2048             .iov            = qiov->iov,
2049             .niov           = qiov->niov,
2050         },
2051     };
2052 
2053     assert(qiov->size == bytes);
2054     return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
2055 }
2056 
2057 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
2058                                       uint64_t bytes, QEMUIOVector *qiov,
2059                                       int flags)
2060 {
2061     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
2062 }
2063 
2064 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
2065                                        uint64_t bytes, QEMUIOVector *qiov,
2066                                        int flags)
2067 {
2068     assert(flags == 0);
2069     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
2070 }
2071 
2072 static void raw_aio_plug(BlockDriverState *bs)
2073 {
2074     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2075 #ifdef CONFIG_LINUX_AIO
2076     if (s->use_linux_aio) {
2077         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2078         laio_io_plug(bs, aio);
2079     }
2080 #endif
2081 #ifdef CONFIG_LINUX_IO_URING
2082     if (s->use_linux_io_uring) {
2083         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2084         luring_io_plug(bs, aio);
2085     }
2086 #endif
2087 }
2088 
2089 static void raw_aio_unplug(BlockDriverState *bs)
2090 {
2091     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2092 #ifdef CONFIG_LINUX_AIO
2093     if (s->use_linux_aio) {
2094         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2095         laio_io_unplug(bs, aio);
2096     }
2097 #endif
2098 #ifdef CONFIG_LINUX_IO_URING
2099     if (s->use_linux_io_uring) {
2100         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2101         luring_io_unplug(bs, aio);
2102     }
2103 #endif
2104 }
2105 
2106 static int raw_co_flush_to_disk(BlockDriverState *bs)
2107 {
2108     BDRVRawState *s = bs->opaque;
2109     RawPosixAIOData acb;
2110     int ret;
2111 
2112     ret = fd_open(bs);
2113     if (ret < 0) {
2114         return ret;
2115     }
2116 
2117     acb = (RawPosixAIOData) {
2118         .bs             = bs,
2119         .aio_fildes     = s->fd,
2120         .aio_type       = QEMU_AIO_FLUSH,
2121     };
2122 
2123 #ifdef CONFIG_LINUX_IO_URING
2124     if (s->use_linux_io_uring) {
2125         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2126         return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
2127     }
2128 #endif
2129     return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
2130 }
2131 
2132 static void raw_aio_attach_aio_context(BlockDriverState *bs,
2133                                        AioContext *new_context)
2134 {
2135     BDRVRawState __attribute__((unused)) *s = bs->opaque;
2136 #ifdef CONFIG_LINUX_AIO
2137     if (s->use_linux_aio) {
2138         Error *local_err = NULL;
2139         if (!aio_setup_linux_aio(new_context, &local_err)) {
2140             error_reportf_err(local_err, "Unable to use native AIO, "
2141                                          "falling back to thread pool: ");
2142             s->use_linux_aio = false;
2143         }
2144     }
2145 #endif
2146 #ifdef CONFIG_LINUX_IO_URING
2147     if (s->use_linux_io_uring) {
2148         Error *local_err = NULL;
2149         if (!aio_setup_linux_io_uring(new_context, &local_err)) {
2150             error_reportf_err(local_err, "Unable to use linux io_uring, "
2151                                          "falling back to thread pool: ");
2152             s->use_linux_io_uring = false;
2153         }
2154     }
2155 #endif
2156 }
2157 
2158 static void raw_close(BlockDriverState *bs)
2159 {
2160     BDRVRawState *s = bs->opaque;
2161 
2162     if (s->fd >= 0) {
2163         qemu_close(s->fd);
2164         s->fd = -1;
2165     }
2166 }
2167 
2168 /**
2169  * Truncates the given regular file @fd to @offset and, when growing, fills the
2170  * new space according to @prealloc.
2171  *
2172  * Returns: 0 on success, -errno on failure.
2173  */
2174 static int coroutine_fn
2175 raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2176                      PreallocMode prealloc, Error **errp)
2177 {
2178     RawPosixAIOData acb;
2179 
2180     acb = (RawPosixAIOData) {
2181         .bs             = bs,
2182         .aio_fildes     = fd,
2183         .aio_type       = QEMU_AIO_TRUNCATE,
2184         .aio_offset     = offset,
2185         .truncate       = {
2186             .prealloc       = prealloc,
2187             .errp           = errp,
2188         },
2189     };
2190 
2191     return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
2192 }
2193 
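/*
 * Truncate to @offset.  Regular files are resized exactly, honouring
 * @prealloc; character and block devices cannot really be resized, so
 * only a request that keeps the current size or inexactly shrinks it
 * succeeds (as a no-op), and anything else fails.
 */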
2194 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2195                                         bool exact, PreallocMode prealloc,
2196                                         BdrvRequestFlags flags, Error **errp)
2197 {
2198     BDRVRawState *s = bs->opaque;
2199     struct stat st;
2200     int ret;
2201 
2202     if (fstat(s->fd, &st)) {
2203         ret = -errno;
2204         error_setg_errno(errp, -ret, "Failed to fstat() the file");
2205         return ret;
2206     }
2207 
2208     if (S_ISREG(st.st_mode)) {
2209         /* Always resizes to the exact @offset */
2210         return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2211     }
2212 
2213     if (prealloc != PREALLOC_MODE_OFF) {
2214         error_setg(errp, "Preallocation mode '%s' unsupported for this "
2215                    "non-regular file", PreallocMode_str(prealloc));
2216         return -ENOTSUP;
2217     }
2218 
2219     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2220         int64_t cur_length = raw_getlength(bs);
2221 
2222         if (offset != cur_length && exact) {
2223             error_setg(errp, "Cannot resize device files");
2224             return -ENOTSUP;
2225         } else if (offset > cur_length) {
2226             error_setg(errp, "Cannot grow device files");
2227             return -EINVAL;
2228         }
2229     } else {
2230         error_setg(errp, "Resizing this file is not supported");
2231         return -ENOTSUP;
2232     }
2233 
2234     return 0;
2235 }
2236 
2237 #ifdef __OpenBSD__
2238 static int64_t raw_getlength(BlockDriverState *bs)
2239 {
2240     BDRVRawState *s = bs->opaque;
2241     int fd = s->fd;
2242     struct stat st;
2243 
2244     if (fstat(fd, &st))
2245         return -errno;
2246     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2247         struct disklabel dl;
2248 
2249         if (ioctl(fd, DIOCGDINFO, &dl))
2250             return -errno;
2251         return (uint64_t)dl.d_secsize *
2252             dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2253     } else
2254         return st.st_size;
2255 }
2256 #elif defined(__NetBSD__)
2257 static int64_t raw_getlength(BlockDriverState *bs)
2258 {
2259     BDRVRawState *s = bs->opaque;
2260     int fd = s->fd;
2261     struct stat st;
2262 
2263     if (fstat(fd, &st))
2264         return -errno;
2265     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2266         struct dkwedge_info dkw;
2267 
2268         if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2269             return dkw.dkw_size * 512;
2270         } else {
2271             struct disklabel dl;
2272 
2273             if (ioctl(fd, DIOCGDINFO, &dl))
2274                 return -errno;
2275             return (uint64_t)dl.d_secsize *
2276                 dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2277         }
2278     } else
2279         return st.st_size;
2280 }
2281 #elif defined(__sun__)
2282 static int64_t raw_getlength(BlockDriverState *bs)
2283 {
2284     BDRVRawState *s = bs->opaque;
2285     struct dk_minfo minfo;
2286     int ret;
2287     int64_t size;
2288 
2289     ret = fd_open(bs);
2290     if (ret < 0) {
2291         return ret;
2292     }
2293 
2294     /*
2295      * Use the DKIOCGMEDIAINFO ioctl to read the size.
2296      */
2297     ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2298     if (ret != -1) {
2299         return minfo.dki_lbsize * minfo.dki_capacity;
2300     }
2301 
2302     /*
2303      * There are reports that lseek on some devices fails, but IRC discussion
2304      * concluded that adding a contingency for that contingency would be overkill.
2305      */
2306     size = lseek(s->fd, 0, SEEK_END);
2307     if (size < 0) {
2308         return -errno;
2309     }
2310     return size;
2311 }
2312 #elif defined(CONFIG_BSD)
2313 static int64_t raw_getlength(BlockDriverState *bs)
2314 {
2315     BDRVRawState *s = bs->opaque;
2316     int fd = s->fd;
2317     int64_t size;
2318     struct stat sb;
2319 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2320     int reopened = 0;
2321 #endif
2322     int ret;
2323 
2324     ret = fd_open(bs);
2325     if (ret < 0)
2326         return ret;
2327 
2328 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2329 again:
2330 #endif
2331     if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2332         size = 0;
2333 #ifdef DIOCGMEDIASIZE
2334         if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
2335             size = 0;
2336         }
2337 #endif
2338 #ifdef DIOCGPART
2339         if (size == 0) {
2340             struct partinfo pi;
2341             if (ioctl(fd, DIOCGPART, &pi) == 0) {
2342                 size = pi.media_size;
2343             }
2344         }
2345 #endif
2346 #if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
2347         if (size == 0) {
2348             uint64_t sectors = 0;
2349             uint32_t sector_size = 0;
2350 
2351             if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2352                && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2353                 size = sectors * sector_size;
2354             }
2355         }
2356 #endif
2357         if (size == 0) {
2358             size = lseek(fd, 0LL, SEEK_END);
2359         }
2360         if (size < 0) {
2361             return -errno;
2362         }
2363 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2364         switch(s->type) {
2365         case FTYPE_CD:
2366             /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2367             if (size == 2048LL * (unsigned)-1)
2368                 size = 0;
2369             /* XXX no disc?  maybe we need to reopen... */
2370             if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2371                 reopened = 1;
2372                 goto again;
2373             }
2374         }
2375 #endif
2376     } else {
2377         size = lseek(fd, 0, SEEK_END);
2378         if (size < 0) {
2379             return -errno;
2380         }
2381     }
2382     return size;
2383 }
2384 #else
2385 static int64_t raw_getlength(BlockDriverState *bs)
2386 {
2387     BDRVRawState *s = bs->opaque;
2388     int ret;
2389     int64_t size;
2390 
2391     ret = fd_open(bs);
2392     if (ret < 0) {
2393         return ret;
2394     }
2395 
2396     size = lseek(s->fd, 0, SEEK_END);
2397     if (size < 0) {
2398         return -errno;
2399     }
2400     return size;
2401 }
2402 #endif
2403 
2404 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2405 {
2406     struct stat st;
2407     BDRVRawState *s = bs->opaque;
2408 
2409     if (fstat(s->fd, &st) < 0) {
2410         return -errno;
2411     }
2412     return (int64_t)st.st_blocks * 512;
2413 }
2414 
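/*
 * Create a new raw image file: create and lock the file, check that no
 * other process holds conflicting locks, truncate it to zero,
 * optionally set the NOCOW flag and an extent size hint, and finally
 * resize it to the requested size with the requested preallocation.
 */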
2415 static int coroutine_fn
2416 raw_co_create(BlockdevCreateOptions *options, Error **errp)
2417 {
2418     BlockdevCreateOptionsFile *file_opts;
2419     Error *local_err = NULL;
2420     int fd;
2421     uint64_t perm, shared;
2422     int result = 0;
2423 
2424     /* Validate options and set default values */
2425     assert(options->driver == BLOCKDEV_DRIVER_FILE);
2426     file_opts = &options->u.file;
2427 
2428     if (!file_opts->has_nocow) {
2429         file_opts->nocow = false;
2430     }
2431     if (!file_opts->has_preallocation) {
2432         file_opts->preallocation = PREALLOC_MODE_OFF;
2433     }
2434     if (!file_opts->has_extent_size_hint) {
2435         file_opts->extent_size_hint = 1 * MiB;
2436     }
2437     if (file_opts->extent_size_hint > UINT32_MAX) {
2438         result = -EINVAL;
2439         error_setg(errp, "Extent size hint is too large");
2440         goto out;
2441     }
2442 
2443     /* Create file */
2444     fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
2445     if (fd < 0) {
2446         result = -errno;
2447         goto out;
2448     }
2449 
2450     /* Take permissions: We want to discard everything, so we need
2451      * BLK_PERM_WRITE; and truncation to the desired size requires
2452      * BLK_PERM_RESIZE.
2453      * On the other hand, we cannot share the RESIZE permission
2454      * because we promise that after this function, the file has the
2455      * size given in the options.  If someone else were to resize it
2456      * concurrently, we could not guarantee that.
2457      * Note that after this function, we can no longer guarantee that
2458      * the file is not touched by a third party, so it may be resized
2459      * then. */
2460     perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2461     shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2462 
2463     /* Step one: Take locks */
2464     result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2465     if (result < 0) {
2466         goto out_close;
2467     }
2468 
2469     /* Step two: Check that nobody else has taken conflicting locks */
2470     result = raw_check_lock_bytes(fd, perm, shared, errp);
2471     if (result < 0) {
2472         error_append_hint(errp,
2473                           "Is another process using the image [%s]?\n",
2474                           file_opts->filename);
2475         goto out_unlock;
2476     }
2477 
2478     /* Clear the file by truncating it to 0 */
2479     result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2480     if (result < 0) {
2481         goto out_unlock;
2482     }
2483 
2484     if (file_opts->nocow) {
2485 #ifdef __linux__
2486         /* Set the NOCOW flag to work around performance issues on file systems
2487          * like btrfs. This is an optimisation. The FS_IOC_SETFLAGS ioctl return
2488          * value is ignored since any failure of this operation should not block
2489          * the remaining work.
2490          */
2491         int attr;
2492         if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2493             attr |= FS_NOCOW_FL;
2494             ioctl(fd, FS_IOC_SETFLAGS, &attr);
2495         }
2496 #endif
2497     }
2498 #ifdef FS_IOC_FSSETXATTR
2499     /*
2500      * Try to set the extent size hint. Failure is not fatal, and a warning is
2501      * only printed if the option was explicitly specified.
2502      */
2503     {
2504         struct fsxattr attr;
2505         result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
2506         if (result == 0) {
2507             attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
2508             attr.fsx_extsize = file_opts->extent_size_hint;
2509             result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
2510         }
2511         if (result < 0 && file_opts->has_extent_size_hint &&
2512             file_opts->extent_size_hint)
2513         {
2514             warn_report("Failed to set extent size hint: %s",
2515                         strerror(errno));
2516         }
2517     }
2518 #endif
2519 
2520     /* Resize and potentially preallocate the file to the desired
2521      * final size */
2522     result = raw_regular_truncate(NULL, fd, file_opts->size,
2523                                   file_opts->preallocation, errp);
2524     if (result < 0) {
2525         goto out_unlock;
2526     }
2527 
2528 out_unlock:
2529     raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2530     if (local_err) {
2531         /* The above call should not fail, and if it does, that does
2532          * not mean the whole creation operation has failed.  So
2533          * report it to the user for their convenience, but do not report
2534          * it to the caller. */
2535         warn_report_err(local_err);
2536     }
2537 
2538 out_close:
2539     if (qemu_close(fd) != 0 && result == 0) {
2540         result = -errno;
2541         error_setg_errno(errp, -result, "Could not close the new file");
2542     }
2543 out:
2544     return result;
2545 }
2546 
2547 static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
2548                                            const char *filename,
2549                                            QemuOpts *opts,
2550                                            Error **errp)
2551 {
2552     BlockdevCreateOptions options;
2553     int64_t total_size = 0;
2554     int64_t extent_size_hint = 0;
2555     bool has_extent_size_hint = false;
2556     bool nocow = false;
2557     PreallocMode prealloc;
2558     char *buf = NULL;
2559     Error *local_err = NULL;
2560 
2561     /* Skip file: protocol prefix */
2562     strstart(filename, "file:", &filename);
2563 
2564     /* Read out options */
2565     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2566                           BDRV_SECTOR_SIZE);
2567     if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
2568         has_extent_size_hint = true;
2569         extent_size_hint =
2570             qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
2571     }
2572     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2573     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2574     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2575                                PREALLOC_MODE_OFF, &local_err);
2576     g_free(buf);
2577     if (local_err) {
2578         error_propagate(errp, local_err);
2579         return -EINVAL;
2580     }
2581 
2582     options = (BlockdevCreateOptions) {
2583         .driver     = BLOCKDEV_DRIVER_FILE,
2584         .u.file     = {
2585             .filename           = (char *) filename,
2586             .size               = total_size,
2587             .has_preallocation  = true,
2588             .preallocation      = prealloc,
2589             .has_nocow          = true,
2590             .nocow              = nocow,
2591             .has_extent_size_hint = has_extent_size_hint,
2592             .extent_size_hint   = extent_size_hint,
2593         },
2594     };
2595     return raw_co_create(&options, errp);
2596 }
2597 
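/* Delete the image file.  Only regular files are removed, via unlink(). */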
2598 static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
2599                                            Error **errp)
2600 {
2601     struct stat st;
2602     int ret;
2603 
2604     if (stat(bs->filename, &st) != 0 || !S_ISREG(st.st_mode)) {
2605         error_setg_errno(errp, ENOENT, "%s is not a regular file",
2606                          bs->filename);
2607         return -ENOENT;
2608     }
2609 
2610     ret = unlink(bs->filename);
2611     if (ret < 0) {
2612         ret = -errno;
2613         error_setg_errno(errp, -ret, "Error when deleting file %s",
2614                          bs->filename);
2615     }
2616 
2617     return ret;
2618 }
2619 
2620 /*
2621  * Find allocation range in @bs around offset @start.
2622  * May change underlying file descriptor's file offset.
2623  * If @start is not in a hole, store @start in @data, and the
2624  * beginning of the next hole in @hole, and return 0.
2625  * If @start is in a non-trailing hole, store @start in @hole and the
2626  * beginning of the next non-hole in @data, and return 0.
2627  * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2628  * If we can't find out, return a negative errno other than -ENXIO.
2629  */
2630 static int find_allocation(BlockDriverState *bs, off_t start,
2631                            off_t *data, off_t *hole)
2632 {
2633 #if defined SEEK_HOLE && defined SEEK_DATA
2634     BDRVRawState *s = bs->opaque;
2635     off_t offs;
2636 
2637     /*
2638      * SEEK_DATA cases:
2639      * D1. offs == start: start is in data
2640      * D2. offs > start: start is in a hole, next data at offs
2641      * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2642      *                              or start is beyond EOF
2643      *     If the latter happens, the file has been truncated behind
2644      *     our back since we opened it.  All bets are off then.
2645      *     Treating it like a trailing hole is simplest.
2646      * D4. offs < 0, errno != ENXIO: we learned nothing
2647      */
2648     offs = lseek(s->fd, start, SEEK_DATA);
2649     if (offs < 0) {
2650         return -errno;          /* D3 or D4 */
2651     }
2652 
2653     if (offs < start) {
2654         /* This is not a valid return by lseek().  We are safe to just return
2655          * -EIO in this case, and we'll treat it like D4. */
2656         return -EIO;
2657     }
2658 
2659     if (offs > start) {
2660         /* D2: in hole, next data at offs */
2661         *hole = start;
2662         *data = offs;
2663         return 0;
2664     }
2665 
2666     /* D1: in data, end not yet known */
2667 
2668     /*
2669      * SEEK_HOLE cases:
2670      * H1. offs == start: start is in a hole
2671      *     If this happens here, a hole has been dug behind our back
2672      *     since the previous lseek().
2673      * H2. offs > start: either start is in data, next hole at offs,
2674      *                   or start is in trailing hole, EOF at offs
2675      *     Linux treats trailing holes like any other hole: offs ==
2676      *     start.  Solaris seeks to EOF instead: offs > start (blech).
2677      *     If that happens here, a hole has been dug behind our back
2678      *     since the previous lseek().
2679      * H3. offs < 0, errno = ENXIO: start is beyond EOF
2680      *     If this happens, the file has been truncated behind our
2681      *     back since we opened it.  Treat it like a trailing hole.
2682      * H4. offs < 0, errno != ENXIO: we learned nothing
2683      *     Pretend we know nothing at all, i.e. "forget" about D1.
2684      */
2685     offs = lseek(s->fd, start, SEEK_HOLE);
2686     if (offs < 0) {
2687         return -errno;          /* D1 and (H3 or H4) */
2688     }
2689 
2690     if (offs < start) {
2691         /* This is not a valid return by lseek().  We are safe to just return
2692          * -EIO in this case, and we'll treat it like H4. */
2693         return -EIO;
2694     }
2695 
2696     if (offs > start) {
2697         /*
2698          * D1 and H2: either in data, next hole at offs, or it was in
2699          * data but is now in a trailing hole.  In the latter case,
2700          * all bets are off.  Treating it as if there was data all
2701          * the way to EOF is safe, so simply do that.
2702          */
2703         *data = start;
2704         *hole = offs;
2705         return 0;
2706     }
2707 
2708     /* D1 and H1 */
2709     return -EBUSY;
2710 #else
2711     return -ENOTSUP;
2712 #endif
2713 }
2714 
2715 /*
2716  * Returns the allocation status of the specified offset.
2717  *
2718  * The block layer guarantees 'offset' and 'bytes' are within bounds.
2719  *
2720  * 'pnum' is set to the number of bytes (including and immediately following
2721  * the specified offset) that are known to be in the same
2722  * allocated/unallocated state.
2723  *
2724  * 'bytes' is the max value 'pnum' should be set to.
2725  */
2726 static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2727                                             bool want_zero,
2728                                             int64_t offset,
2729                                             int64_t bytes, int64_t *pnum,
2730                                             int64_t *map,
2731                                             BlockDriverState **file)
2732 {
2733     off_t data = 0, hole = 0;
2734     int ret;
2735 
2736     assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
2737 
2738     ret = fd_open(bs);
2739     if (ret < 0) {
2740         return ret;
2741     }
2742 
2743     if (!want_zero) {
2744         *pnum = bytes;
2745         *map = offset;
2746         *file = bs;
2747         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2748     }
2749 
2750     ret = find_allocation(bs, offset, &data, &hole);
2751     if (ret == -ENXIO) {
2752         /* Trailing hole */
2753         *pnum = bytes;
2754         ret = BDRV_BLOCK_ZERO;
2755     } else if (ret < 0) {
2756         /* No info available, so pretend there are no holes */
2757         *pnum = bytes;
2758         ret = BDRV_BLOCK_DATA;
2759     } else if (data == offset) {
2760         /* On a data extent, compute bytes to the end of the extent,
2761          * possibly including a partial sector at EOF. */
2762         *pnum = MIN(bytes, hole - offset);
2763 
2764         /*
2765          * We are not allowed to return partial sectors, though, so
2766          * round up if necessary.
2767          */
2768         if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
2769             int64_t file_length = raw_getlength(bs);
2770             if (file_length > 0) {
2771                 /* Ignore errors, this is just a safeguard */
2772                 assert(hole == file_length);
2773             }
2774             *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
2775         }
2776 
2777         ret = BDRV_BLOCK_DATA;
2778     } else {
2779         /* On a hole, compute bytes to the beginning of the next extent.  */
2780         assert(hole == offset);
2781         *pnum = MIN(bytes, data - offset);
2782         ret = BDRV_BLOCK_ZERO;
2783     }
2784     *map = offset;
2785     *file = bs;
2786     return ret | BDRV_BLOCK_OFFSET_VALID;
2787 }
2788 
2789 #if defined(__linux__)
2790 /* Verify that the file is not in the page cache */
2791 static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2792 {
2793     const size_t window_size = 128 * 1024 * 1024;
2794     BDRVRawState *s = bs->opaque;
2795     void *window = NULL;
2796     size_t length = 0;
2797     unsigned char *vec;
2798     size_t page_size;
2799     off_t offset;
2800     off_t end;
2801 
2802     /* mincore(2) page status information requires 1 byte per page */
2803     page_size = sysconf(_SC_PAGESIZE);
2804     vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2805 
2806     end = raw_getlength(bs);
2807 
2808     for (offset = 0; offset < end; offset += window_size) {
2809         void *new_window;
2810         size_t new_length;
2811         size_t vec_end;
2812         size_t i;
2813         int ret;
2814 
2815         /* Unmap previous window if size has changed */
2816         new_length = MIN(end - offset, window_size);
2817         if (new_length != length) {
2818             munmap(window, length);
2819             window = NULL;
2820             length = 0;
2821         }
2822 
2823         new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2824                           s->fd, offset);
2825         if (new_window == MAP_FAILED) {
2826             error_setg_errno(errp, errno, "mmap failed");
2827             break;
2828         }
2829 
2830         window = new_window;
2831         length = new_length;
2832 
2833         ret = mincore(window, length, vec);
2834         if (ret < 0) {
2835             error_setg_errno(errp, errno, "mincore failed");
2836             break;
2837         }
2838 
2839         vec_end = DIV_ROUND_UP(length, page_size);
2840         for (i = 0; i < vec_end; i++) {
2841             if (vec[i] & 0x1) {
2842                 break;
2843             }
2844         }
2845         if (i < vec_end) {
2846             error_setg(errp, "page cache still in use!");
2847             break;
2848         }
2849     }
2850 
2851     if (window) {
2852         munmap(window, length);
2853     }
2854 
2855     g_free(vec);
2856 }
2857 #endif /* __linux__ */
2858 
2859 static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2860                                                  Error **errp)
2861 {
2862     BDRVRawState *s = bs->opaque;
2863     int ret;
2864 
2865     ret = fd_open(bs);
2866     if (ret < 0) {
2867         error_setg_errno(errp, -ret, "The file descriptor is not open");
2868         return;
2869     }
2870 
2871     if (!s->drop_cache) {
2872         return;
2873     }
2874 
2875     if (s->open_flags & O_DIRECT) {
2876         return; /* No host kernel page cache */
2877     }
2878 
2879 #if defined(__linux__)
2880     /* Flush dirty pages first so the fadvise() below can actually drop them */
2881     ret = bdrv_co_flush(bs);
2882     if (ret < 0) {
2883         error_setg_errno(errp, -ret, "flush failed");
2884         return;
2885     }
2886 
2887     /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2888      * process.  These limitations are okay because we just fsynced the file,
2889      * we don't use mmap, and the file should not be in use by other processes.
2890      */
2891     ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2892     if (ret != 0) { /* the return value is a positive errno */
2893         error_setg_errno(errp, ret, "fadvise failed");
2894         return;
2895     }
2896 
2897     if (s->check_cache_dropped) {
2898         check_cache_dropped(bs, errp);
2899     }
2900 #else /* __linux__ */
2901     /* Do nothing.  Live migration to a remote host with cache.direct=off is
2902      * unsupported on other host operating systems.  Cache consistency issues
2903      * may occur but no error is reported here, partly because that's the
2904      * historical behavior and partly because it's hard to differentiate valid
2905      * configurations that should not cause errors.
2906      */
2907 #endif /* !__linux__ */
2908 }
2909 
2910 static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
2911 {
2912     if (ret) {
2913         s->stats.discard_nb_failed++;
2914     } else {
2915         s->stats.discard_nb_ok++;
2916         s->stats.discard_bytes_ok += nbytes;
2917     }
2918 }
2919 
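/*
 * Common discard path for regular files and block devices: submit a
 * QEMU_AIO_DISCARD request (flagged QEMU_AIO_BLKDEV for host devices)
 * through the thread pool and account the result in the discard
 * statistics.
 */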
2920 static coroutine_fn int
2921 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev)
2922 {
2923     BDRVRawState *s = bs->opaque;
2924     RawPosixAIOData acb;
2925     int ret;
2926 
2927     acb = (RawPosixAIOData) {
2928         .bs             = bs,
2929         .aio_fildes     = s->fd,
2930         .aio_type       = QEMU_AIO_DISCARD,
2931         .aio_offset     = offset,
2932         .aio_nbytes     = bytes,
2933     };
2934 
2935     if (blkdev) {
2936         acb.aio_type |= QEMU_AIO_BLKDEV;
2937     }
2938 
2939     ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
2940     raw_account_discard(s, bytes, ret);
2941     return ret;
2942 }
2943 
2944 static coroutine_fn int
2945 raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2946 {
2947     return raw_do_pdiscard(bs, offset, bytes, false);
2948 }
2949 
2950 static int coroutine_fn
2951 raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
2952                      BdrvRequestFlags flags, bool blkdev)
2953 {
2954     BDRVRawState *s = bs->opaque;
2955     RawPosixAIOData acb;
2956     ThreadPoolFunc *handler;
2957 
2958 #ifdef CONFIG_FALLOCATE
2959     if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
2960         BdrvTrackedRequest *req;
2961 
2962         /*
2963          * This is a workaround for a bug in the Linux XFS driver,
2964          * where writes submitted through the AIO interface will be
2965          * discarded if they happen beyond a concurrently running
2966          * fallocate() that increases the file length (i.e., both the
2967          * write and the fallocate() happen beyond the EOF).
2968          *
2969          * To work around it, we extend the tracked request for this
2970          * zero write until INT64_MAX (effectively infinity), and mark
2971          * it as serializing.
2972          *
2973          * We have to enable this workaround for all filesystems and
2974          * AIO modes (not just XFS with aio=native), because for
2975          * remote filesystems we do not know the host configuration.
2976          */
2977 
2978         req = bdrv_co_get_self_request(bs);
2979         assert(req);
2980         assert(req->type == BDRV_TRACKED_WRITE);
2981         assert(req->offset <= offset);
2982         assert(req->offset + req->bytes >= offset + bytes);
2983 
2984         req->bytes = BDRV_MAX_LENGTH - req->offset;
2985 
2986         bdrv_check_request(req->offset, req->bytes, &error_abort);
2987 
2988         bdrv_make_request_serialising(req, bs->bl.request_alignment);
2989     }
2990 #endif
2991 
2992     acb = (RawPosixAIOData) {
2993         .bs             = bs,
2994         .aio_fildes     = s->fd,
2995         .aio_type       = QEMU_AIO_WRITE_ZEROES,
2996         .aio_offset     = offset,
2997         .aio_nbytes     = bytes,
2998     };
2999 
3000     if (blkdev) {
3001         acb.aio_type |= QEMU_AIO_BLKDEV;
3002     }
3003     if (flags & BDRV_REQ_NO_FALLBACK) {
3004         acb.aio_type |= QEMU_AIO_NO_FALLBACK;
3005     }
3006 
3007     if (flags & BDRV_REQ_MAY_UNMAP) {
3008         acb.aio_type |= QEMU_AIO_DISCARD;
3009         handler = handle_aiocb_write_zeroes_unmap;
3010     } else {
3011         handler = handle_aiocb_write_zeroes;
3012     }
3013 
3014     return raw_thread_pool_submit(bs, handler, &acb);
3015 }
3016 
3017 static int coroutine_fn raw_co_pwrite_zeroes(
3018     BlockDriverState *bs, int64_t offset,
3019     int bytes, BdrvRequestFlags flags)
3020 {
3021     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
3022 }
3023 
3024 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3025 {
3026     return 0;
3027 }
3028 
3029 static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
3030 {
3031     BDRVRawState *s = bs->opaque;
3032     return (BlockStatsSpecificFile) {
3033         .discard_nb_ok = s->stats.discard_nb_ok,
3034         .discard_nb_failed = s->stats.discard_nb_failed,
3035         .discard_bytes_ok = s->stats.discard_bytes_ok,
3036     };
3037 }
3038 
3039 static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
3040 {
3041     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3042 
3043     stats->driver = BLOCKDEV_DRIVER_FILE;
3044     stats->u.file = get_blockstats_specific_file(bs);
3045 
3046     return stats;
3047 }
3048 
3049 #if defined(HAVE_HOST_BLOCK_DEVICE)
3050 static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
3051 {
3052     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3053 
3054     stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
3055     stats->u.host_device = get_blockstats_specific_file(bs);
3056 
3057     return stats;
3058 }
3059 #endif /* HAVE_HOST_BLOCK_DEVICE */
3060 
3061 static QemuOptsList raw_create_opts = {
3062     .name = "raw-create-opts",
3063     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
3064     .desc = {
3065         {
3066             .name = BLOCK_OPT_SIZE,
3067             .type = QEMU_OPT_SIZE,
3068             .help = "Virtual disk size"
3069         },
3070         {
3071             .name = BLOCK_OPT_NOCOW,
3072             .type = QEMU_OPT_BOOL,
3073             .help = "Turn off copy-on-write (valid only on btrfs)"
3074         },
3075         {
3076             .name = BLOCK_OPT_PREALLOC,
3077             .type = QEMU_OPT_STRING,
3078             .help = "Preallocation mode (allowed values: off"
3079 #ifdef CONFIG_POSIX_FALLOCATE
3080                     ", falloc"
3081 #endif
3082                     ", full)"
3083         },
3084         {
3085             .name = BLOCK_OPT_EXTENT_SIZE_HINT,
3086             .type = QEMU_OPT_SIZE,
3087             .help = "Extent size hint for the image file, 0 to disable"
3088         },
3089         { /* end of list */ }
3090     }
3091 };
3092 
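/*
 * First phase of a permission change.  If auto-read-only requires
 * switching between read-only and read-write mode, a new file
 * descriptor is opened and stored in perm_change_fd.  Locks are
 * prepared on the old fd and copied to the new one; the change is then
 * either committed by raw_set_perm() or rolled back by
 * raw_abort_perm_update().
 */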
3093 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
3094                           Error **errp)
3095 {
3096     BDRVRawState *s = bs->opaque;
3097     int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags;
3098     int open_flags;
3099     int ret;
3100 
3101     /* We may need a new fd if auto-read-only switches the mode */
3102     ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm,
3103                                 false, errp);
3104     if (ret < 0) {
3105         return ret;
3106     } else if (ret != s->fd) {
3107         Error *local_err = NULL;
3108 
3109         /*
3110          * Fail already in check_perm() if we can't get a working O_DIRECT
3111          * alignment with the new fd.
3112          */
3113         raw_probe_alignment(bs, ret, &local_err);
3114         if (local_err) {
3115             error_propagate(errp, local_err);
3116             return -EINVAL;
3117         }
3118 
3119         s->perm_change_fd = ret;
3120         s->perm_change_flags = open_flags;
3121     }
3122 
3123     /* Prepare permissions on old fd to avoid conflicts between old and new,
3124      * but keep everything locked that new will need. */
3125     ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
3126     if (ret < 0) {
3127         goto fail;
3128     }
3129 
3130     /* Copy locks to the new fd */
3131     if (s->perm_change_fd && s->use_lock) {
3132         ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
3133                                    false, errp);
3134         if (ret < 0) {
3135             raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3136             goto fail;
3137         }
3138     }
3139     return 0;
3140 
3141 fail:
3142     if (s->perm_change_fd) {
3143         qemu_close(s->perm_change_fd);
3144     }
3145     s->perm_change_fd = 0;
3146     return ret;
3147 }
3148 
3149 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
3150 {
3151     BDRVRawState *s = bs->opaque;
3152 
3153     /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
3154      * called after .bdrv_reopen_commit) */
3155     if (s->perm_change_fd && s->fd != s->perm_change_fd) {
3156         qemu_close(s->fd);
3157         s->fd = s->perm_change_fd;
3158         s->open_flags = s->perm_change_flags;
3159     }
3160     s->perm_change_fd = 0;
3161 
3162     raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
3163     s->perm = perm;
3164     s->shared_perm = shared;
3165 }
3166 
3167 static void raw_abort_perm_update(BlockDriverState *bs)
3168 {
3169     BDRVRawState *s = bs->opaque;
3170 
3171     /* For reopen, .bdrv_reopen_abort is called afterwards and will close
3172      * the file descriptor. */
3173     if (s->perm_change_fd) {
3174         qemu_close(s->perm_change_fd);
3175     }
3176     s->perm_change_fd = 0;
3177 
3178     raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3179 }
3180 
3181 static int coroutine_fn raw_co_copy_range_from(
3182         BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
3183         BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
3184         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
3185 {
3186     return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3187                                  read_flags, write_flags);
3188 }
3189 
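/*
 * Offloaded copy into this file.  This only works when the source is
 * another file-posix node; the request carries the source descriptor
 * as aio_fildes and our own descriptor as the copy-range destination.
 */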
3190 static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
3191                                              BdrvChild *src,
3192                                              uint64_t src_offset,
3193                                              BdrvChild *dst,
3194                                              uint64_t dst_offset,
3195                                              uint64_t bytes,
3196                                              BdrvRequestFlags read_flags,
3197                                              BdrvRequestFlags write_flags)
3198 {
3199     RawPosixAIOData acb;
3200     BDRVRawState *s = bs->opaque;
3201     BDRVRawState *src_s;
3202 
3203     assert(dst->bs == bs);
3204     if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
3205         return -ENOTSUP;
3206     }
3207 
3208     src_s = src->bs->opaque;
3209     if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
3210         return -EIO;
3211     }
3212 
3213     acb = (RawPosixAIOData) {
3214         .bs             = bs,
3215         .aio_type       = QEMU_AIO_COPY_RANGE,
3216         .aio_fildes     = src_s->fd,
3217         .aio_offset     = src_offset,
3218         .aio_nbytes     = bytes,
3219         .copy_range     = {
3220             .aio_fd2        = s->fd,
3221             .aio_offset2    = dst_offset,
3222         },
3223     };
3224 
3225     return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
3226 }
3227 
3228 BlockDriver bdrv_file = {
3229     .format_name = "file",
3230     .protocol_name = "file",
3231     .instance_size = sizeof(BDRVRawState),
3232     .bdrv_needs_filename = true,
3233     .bdrv_probe = NULL, /* no probe for protocols */
3234     .bdrv_parse_filename = raw_parse_filename,
3235     .bdrv_file_open = raw_open,
3236     .bdrv_reopen_prepare = raw_reopen_prepare,
3237     .bdrv_reopen_commit = raw_reopen_commit,
3238     .bdrv_reopen_abort = raw_reopen_abort,
3239     .bdrv_close = raw_close,
3240     .bdrv_co_create = raw_co_create,
3241     .bdrv_co_create_opts = raw_co_create_opts,
3242     .bdrv_has_zero_init = bdrv_has_zero_init_1,
3243     .bdrv_co_block_status = raw_co_block_status,
3244     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3245     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
3246     .bdrv_co_delete_file = raw_co_delete_file,
3247 
3248     .bdrv_co_preadv         = raw_co_preadv,
3249     .bdrv_co_pwritev        = raw_co_pwritev,
3250     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3251     .bdrv_co_pdiscard       = raw_co_pdiscard,
3252     .bdrv_co_copy_range_from = raw_co_copy_range_from,
3253     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3254     .bdrv_refresh_limits = raw_refresh_limits,
3255     .bdrv_io_plug = raw_aio_plug,
3256     .bdrv_io_unplug = raw_aio_unplug,
3257     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3258 
3259     .bdrv_co_truncate = raw_co_truncate,
3260     .bdrv_getlength = raw_getlength,
3261     .bdrv_get_info = raw_get_info,
3262     .bdrv_get_allocated_file_size
3263                         = raw_get_allocated_file_size,
3264     .bdrv_get_specific_stats = raw_get_specific_stats,
3265     .bdrv_check_perm = raw_check_perm,
3266     .bdrv_set_perm   = raw_set_perm,
3267     .bdrv_abort_perm_update = raw_abort_perm_update,
3268     .create_opts = &raw_create_opts,
3269     .mutable_opts = mutable_opts,
3270 };
3271 
3272 /***********************************************/
3273 /* host device */
3274 
3275 #if defined(HAVE_HOST_BLOCK_DEVICE)
3276 
3277 #if defined(__APPLE__) && defined(__MACH__)
3278 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3279                                 CFIndex maxPathSize, int flags);
3280 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
3281 {
3282     kern_return_t kernResult = KERN_FAILURE;
3283     mach_port_t     masterPort;
3284     CFMutableDictionaryRef  classesToMatch;
3285     const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
3286     char *mediaType = NULL;
3287 
3288     kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
3289     if ( KERN_SUCCESS != kernResult ) {
3290         printf( "IOMasterPort returned %d\n", kernResult );
3291     }
3292 
3293     int index;
3294     for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
3295         classesToMatch = IOServiceMatching(matching_array[index]);
3296         if (classesToMatch == NULL) {
3297             error_report("IOServiceMatching returned NULL for %s",
3298                          matching_array[index]);
3299             continue;
3300         }
3301         CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
3302                              kCFBooleanTrue);
3303         kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
3304                                                   mediaIterator);
3305         if (kernResult != KERN_SUCCESS) {
3306             error_report("Note: IOServiceGetMatchingServices returned %d",
3307                          kernResult);
3308             continue;
3309         }
3310 
3311         /* If a match was found, leave the loop */
3312         if (*mediaIterator != 0) {
3313             trace_file_FindEjectableOpticalMedia(matching_array[index]);
3314             mediaType = g_strdup(matching_array[index]);
3315             break;
3316         }
3317     }
3318     return mediaType;
3319 }
3320 
3321 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3322                          CFIndex maxPathSize, int flags)
3323 {
3324     io_object_t     nextMedia;
3325     kern_return_t   kernResult = KERN_FAILURE;
3326     *bsdPath = '\0';
3327     nextMedia = IOIteratorNext( mediaIterator );
3328     if ( nextMedia )
3329     {
3330         CFTypeRef   bsdPathAsCFString;
3331         bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
3332         if ( bsdPathAsCFString ) {
3333             size_t devPathLength;
3334             strcpy( bsdPath, _PATH_DEV );
3335             if (flags & BDRV_O_NOCACHE) {
3336                 strcat(bsdPath, "r");
3337             }
3338             devPathLength = strlen( bsdPath );
3339             if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
3340                 kernResult = KERN_SUCCESS;
3341             }
3342             CFRelease( bsdPathAsCFString );
3343         }
3344         IOObjectRelease( nextMedia );
3345     }
3346 
3347     return kernResult;
3348 }
3349 
3350 /* Sets up a real cdrom for use in QEMU */
3351 static bool setup_cdrom(char *bsd_path, Error **errp)
3352 {
3353     int index, num_of_test_partitions = 2, fd;
3354     char test_partition[MAXPATHLEN];
3355     bool partition_found = false;
3356 
3357     /* look for a working partition */
3358     for (index = 0; index < num_of_test_partitions; index++) {
3359         snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
3360                  index);
3361         fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
3362         if (fd >= 0) {
3363             partition_found = true;
3364             qemu_close(fd);
3365             break;
3366         }
3367     }
3368 
3369     /* if a working partition on the device was not found */
3370     if (partition_found == false) {
3371         error_setg(errp, "Failed to find a working partition on disc");
3372     } else {
3373         trace_file_setup_cdrom(test_partition);
3374         pstrcpy(bsd_path, MAXPATHLEN, test_partition);
3375     }
3376     return partition_found;
3377 }
3378 
3379 /* Prints directions on mounting and unmounting a device */
3380 static void print_unmounting_directions(const char *file_name)
3381 {
3382     error_report("If device %s is mounted on the desktop, unmount"
3383                  " it first before using it in QEMU", file_name);
3384     error_report("Command to unmount device: diskutil unmountDisk %s",
3385                  file_name);
3386     error_report("Command to mount device: diskutil mountDisk %s", file_name);
3387 }
3388 
3389 #endif /* defined(__APPLE__) && defined(__MACH__) */
3390 
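/*
 * Probe score for the host_device driver: /dev/cdrom scores lower (50) so
 * that a dedicated CD-ROM driver can claim it, other character and block
 * device nodes score 100, anything else 0.
 */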
3391 static int hdev_probe_device(const char *filename)
3392 {
3393     struct stat st;
3394 
3395     /* allow a dedicated CD-ROM driver to match with a higher priority */
3396     if (strstart(filename, "/dev/cdrom", NULL))
3397         return 50;
3398 
3399     if (stat(filename, &st) >= 0 &&
3400             (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
3401         return 100;
3402     }
3403 
3404     return 0;
3405 }
3406 
3407 static void hdev_parse_filename(const char *filename, QDict *options,
3408                                 Error **errp)
3409 {
3410     bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
3411 }
3412 
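/* Check whether @bs refers to a Linux SCSI generic (sg) character device */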
3413 static bool hdev_is_sg(BlockDriverState *bs)
3414 {
3415 
3416 #if defined(__linux__)
3417 
3418     BDRVRawState *s = bs->opaque;
3419     struct stat st;
3420     struct sg_scsi_id scsiid;
3421     int sg_version;
3422     int ret;
3423 
3424     if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
3425         return false;
3426     }
3427 
3428     ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
3429     if (ret < 0) {
3430         return false;
3431     }
3432 
3433     ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
3434     if (ret >= 0) {
3435         trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
3436         return true;
3437     }
3438 
3439 #endif
3440 
3441     return false;
3442 }
3443 
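/*
 * Open a host block device.  On macOS, "/dev/cdrom" is first resolved to the
 * BSD path of an ejectable optical drive before the common raw open path runs.
 */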
3444 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
3445                      Error **errp)
3446 {
3447     BDRVRawState *s = bs->opaque;
3448     int ret;
3449 
3450 #if defined(__APPLE__) && defined(__MACH__)
3451     /*
3452      * Caution: while qdict_get_str() is fine, getting non-string types
3453      * would require more care.  When @options come from -blockdev or
3454      * blockdev_add, its members are typed according to the QAPI
3455      * schema, but when they come from -drive, they're all QString.
3456      */
3457     const char *filename = qdict_get_str(options, "filename");
3458     char bsd_path[MAXPATHLEN] = "";
3459     bool error_occurred = false;
3460 
3461     /* If using a real cdrom */
3462     if (strcmp(filename, "/dev/cdrom") == 0) {
3463         char *mediaType = NULL;
3464         kern_return_t ret_val;
3465         io_iterator_t mediaIterator = 0;
3466 
3467         mediaType = FindEjectableOpticalMedia(&mediaIterator);
3468         if (mediaType == NULL) {
3469             error_setg(errp, "Please make sure your CD/DVD is in the optical"
3470                        " drive");
3471             error_occurred = true;
3472             goto hdev_open_Mac_error;
3473         }
3474 
3475         ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
3476         if (ret_val != KERN_SUCCESS) {
3477             error_setg(errp, "Could not get BSD path for optical drive");
3478             error_occurred = true;
3479             goto hdev_open_Mac_error;
3480         }
3481 
3482         /* If a real optical drive was not found */
3483         if (bsd_path[0] == '\0') {
3484             error_setg(errp, "Failed to obtain bsd path for optical drive");
3485             error_occurred = true;
3486             goto hdev_open_Mac_error;
3487         }
3488 
3489         /* If this is a CD-ROM disc and no usable partition was found on it */
3490         if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
3491             setup_cdrom(bsd_path, errp) == false) {
3492             print_unmounting_directions(bsd_path);
3493             error_occurred = true;
3494             goto hdev_open_Mac_error;
3495         }
3496 
3497         qdict_put_str(options, "filename", bsd_path);
3498 
3499 hdev_open_Mac_error:
3500         g_free(mediaType);
3501         if (mediaIterator) {
3502             IOObjectRelease(mediaIterator);
3503         }
3504         if (error_occurred) {
3505             return -ENOENT;
3506         }
3507     }
3508 #endif /* defined(__APPLE__) && defined(__MACH__) */
3509 
3510     s->type = FTYPE_FILE;
3511 
3512     ret = raw_open_common(bs, options, flags, 0, true, errp);
3513     if (ret < 0) {
3514 #if defined(__APPLE__) && defined(__MACH__)
3515         if (*bsd_path) {
3516             filename = bsd_path;
3517         }
3518         /* if a physical device experienced an error while being opened */
3519         if (strncmp(filename, "/dev/", 5) == 0) {
3520             print_unmounting_directions(filename);
3521         }
3522 #endif /* defined(__APPLE__) && defined(__MACH__) */
3523         return ret;
3524     }
3525 
3526     /* Since this does an ioctl, the device must already be open */
3527     bs->sg = hdev_is_sg(bs);
3528 
3529     return ret;
3530 }
3531 
3532 #if defined(__linux__)
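/*
 * Forward an ioctl to the host device via the thread pool.  SG_IO persistent
 * reservation commands are instead handed to the configured PR manager.
 */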
3533 static int coroutine_fn
3534 hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3535 {
3536     BDRVRawState *s = bs->opaque;
3537     RawPosixAIOData acb;
3538     int ret;
3539 
3540     ret = fd_open(bs);
3541     if (ret < 0) {
3542         return ret;
3543     }
3544 
3545     if (req == SG_IO && s->pr_mgr) {
3546         struct sg_io_hdr *io_hdr = buf;
3547         if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3548             io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3549             return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3550                                       s->fd, io_hdr);
3551         }
3552     }
3553 
3554     acb = (RawPosixAIOData) {
3555         .bs         = bs,
3556         .aio_type   = QEMU_AIO_IOCTL,
3557         .aio_fildes = s->fd,
3558         .aio_offset = 0,
3559         .ioctl      = {
3560             .buf        = buf,
3561             .cmd        = req,
3562         },
3563     };
3564 
3565     return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
3566 }
3567 #endif /* linux */
3568 
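/*
 * Discard on a host device: fails (and accounts the failed request) if the
 * device is no longer open, otherwise issues a block-device level discard.
 */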
3569 static coroutine_fn int
3570 hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
3571 {
3572     BDRVRawState *s = bs->opaque;
3573     int ret;
3574 
3575     ret = fd_open(bs);
3576     if (ret < 0) {
3577         raw_account_discard(s, bytes, ret);
3578         return ret;
3579     }
3580     return raw_do_pdiscard(bs, offset, bytes, true);
3581 }
3582 
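/* Write zeroes on a host device, checking first that it is still open */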
3583 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3584     int64_t offset, int bytes, BdrvRequestFlags flags)
3585 {
3586     int rc;
3587 
3588     rc = fd_open(bs);
3589     if (rc < 0) {
3590         return rc;
3591     }
3592 
3593     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
3594 }
3595 
3596 static BlockDriver bdrv_host_device = {
3597     .format_name        = "host_device",
3598     .protocol_name      = "host_device",
3599     .instance_size      = sizeof(BDRVRawState),
3600     .bdrv_needs_filename = true,
3601     .bdrv_probe_device  = hdev_probe_device,
3602     .bdrv_parse_filename = hdev_parse_filename,
3603     .bdrv_file_open     = hdev_open,
3604     .bdrv_close         = raw_close,
3605     .bdrv_reopen_prepare = raw_reopen_prepare,
3606     .bdrv_reopen_commit  = raw_reopen_commit,
3607     .bdrv_reopen_abort   = raw_reopen_abort,
3608     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3609     .create_opts         = &bdrv_create_opts_simple,
3610     .mutable_opts        = mutable_opts,
3611     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3612     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3613 
3614     .bdrv_co_preadv         = raw_co_preadv,
3615     .bdrv_co_pwritev        = raw_co_pwritev,
3616     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3617     .bdrv_co_pdiscard       = hdev_co_pdiscard,
3618     .bdrv_co_copy_range_from = raw_co_copy_range_from,
3619     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3620     .bdrv_refresh_limits = raw_refresh_limits,
3621     .bdrv_io_plug = raw_aio_plug,
3622     .bdrv_io_unplug = raw_aio_unplug,
3623     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3624 
3625     .bdrv_co_truncate       = raw_co_truncate,
3626     .bdrv_getlength = raw_getlength,
3627     .bdrv_get_info = raw_get_info,
3628     .bdrv_get_allocated_file_size
3629                         = raw_get_allocated_file_size,
3630     .bdrv_get_specific_stats = hdev_get_specific_stats,
3631     .bdrv_check_perm = raw_check_perm,
3632     .bdrv_set_perm   = raw_set_perm,
3633     .bdrv_abort_perm_update = raw_abort_perm_update,
3634     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3635     .bdrv_probe_geometry = hdev_probe_geometry,
3636 
3637     /* generic scsi device */
3638 #ifdef __linux__
3639     .bdrv_co_ioctl          = hdev_co_ioctl,
3640 #endif
3641 };
3642 
3643 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3644 static void cdrom_parse_filename(const char *filename, QDict *options,
3645                                  Error **errp)
3646 {
3647     bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3648 }
3649 #endif
3650 
3651 #ifdef __linux__
3652 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3653                       Error **errp)
3654 {
3655     BDRVRawState *s = bs->opaque;
3656 
3657     s->type = FTYPE_CD;
3658 
3659     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3660     return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3661 }
3662 
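/*
 * Probe score for host_cdrom on Linux: 100 for a block device that answers
 * the CDROM_DRIVE_STATUS ioctl, 0 otherwise.
 */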
3663 static int cdrom_probe_device(const char *filename)
3664 {
3665     int fd, ret;
3666     int prio = 0;
3667     struct stat st;
3668 
3669     fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
3670     if (fd < 0) {
3671         goto out;
3672     }
3673     ret = fstat(fd, &st);
3674     if (ret == -1 || !S_ISBLK(st.st_mode)) {
3675         goto outc;
3676     }
3677 
3678     /* Attempt to detect via a CDROM specific ioctl */
3679     ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3680     if (ret >= 0)
3681         prio = 100;
3682 
3683 outc:
3684     qemu_close(fd);
3685 out:
3686     return prio;
3687 }
3688 
3689 static bool cdrom_is_inserted(BlockDriverState *bs)
3690 {
3691     BDRVRawState *s = bs->opaque;
3692     int ret;
3693 
3694     ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3695     return ret == CDS_DISC_OK;
3696 }
3697 
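/* Eject or close the tray via the CDROMEJECT / CDROMCLOSETRAY ioctls */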
3698 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3699 {
3700     BDRVRawState *s = bs->opaque;
3701 
3702     if (eject_flag) {
3703         if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3704             perror("CDROMEJECT");
3705     } else {
3706         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3707             perror("CDROMCLOSETRAY");
3708     }
3709 }
3710 
3711 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3712 {
3713     BDRVRawState *s = bs->opaque;
3714 
3715     if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3716         /*
3717          * Note: an error can happen if the distribution automatically
3718          * mounts the CD-ROM
3719          */
3720         /* perror("CDROM_LOCKDOOR"); */
3721     }
3722 }
3723 
3724 static BlockDriver bdrv_host_cdrom = {
3725     .format_name        = "host_cdrom",
3726     .protocol_name      = "host_cdrom",
3727     .instance_size      = sizeof(BDRVRawState),
3728     .bdrv_needs_filename = true,
3729     .bdrv_probe_device  = cdrom_probe_device,
3730     .bdrv_parse_filename = cdrom_parse_filename,
3731     .bdrv_file_open     = cdrom_open,
3732     .bdrv_close         = raw_close,
3733     .bdrv_reopen_prepare = raw_reopen_prepare,
3734     .bdrv_reopen_commit  = raw_reopen_commit,
3735     .bdrv_reopen_abort   = raw_reopen_abort,
3736     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3737     .create_opts         = &bdrv_create_opts_simple,
3738     .mutable_opts        = mutable_opts,
3739     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3740 
3741     .bdrv_co_preadv         = raw_co_preadv,
3742     .bdrv_co_pwritev        = raw_co_pwritev,
3743     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3744     .bdrv_refresh_limits = raw_refresh_limits,
3745     .bdrv_io_plug = raw_aio_plug,
3746     .bdrv_io_unplug = raw_aio_unplug,
3747     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3748 
3749     .bdrv_co_truncate    = raw_co_truncate,
3750     .bdrv_getlength      = raw_getlength,
3751     .has_variable_length = true,
3752     .bdrv_get_allocated_file_size
3753                         = raw_get_allocated_file_size,
3754 
3755     /* removable device support */
3756     .bdrv_is_inserted   = cdrom_is_inserted,
3757     .bdrv_eject         = cdrom_eject,
3758     .bdrv_lock_medium   = cdrom_lock_medium,
3759 
3760     /* generic scsi device */
3761     .bdrv_co_ioctl      = hdev_co_ioctl,
3762 };
3763 #endif /* __linux__ */
3764 
3765 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
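/* Open a host CD-ROM on FreeBSD and make sure the tray is not locked */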
3766 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3767                       Error **errp)
3768 {
3769     BDRVRawState *s = bs->opaque;
3770     int ret;
3771 
3772     s->type = FTYPE_CD;
3773 
3774     ret = raw_open_common(bs, options, flags, 0, true, errp);
3775     if (ret) {
3776         return ret;
3777     }
3778 
3779     /* make sure the door isn't locked at this time */
3780     ioctl(s->fd, CDIOCALLOW);
3781     return 0;
3782 }
3783 
3784 static int cdrom_probe_device(const char *filename)
3785 {
3786     if (strstart(filename, "/dev/cd", NULL) ||
3787             strstart(filename, "/dev/acd", NULL))
3788         return 100;
3789     return 0;
3790 }
3791 
3792 static int cdrom_reopen(BlockDriverState *bs)
3793 {
3794     BDRVRawState *s = bs->opaque;
3795     int fd;
3796 
3797     /*
3798      * Force a reread of the possibly changed or newly loaded disc;
3799      * FreeBSD sometimes fails to notice the change otherwise.
3800      */
3801     if (s->fd >= 0)
3802         qemu_close(s->fd);
3803     fd = qemu_open(bs->filename, s->open_flags, NULL);
3804     if (fd < 0) {
3805         s->fd = -1;
3806         return -EIO;
3807     }
3808     s->fd = fd;
3809 
3810     /* make sure the door isn't locked at this time */
3811     ioctl(s->fd, CDIOCALLOW);
3812     return 0;
3813 }
3814 
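/* A medium is considered present when the device reports a non-zero length */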
3815 static bool cdrom_is_inserted(BlockDriverState *bs)
3816 {
3817     return raw_getlength(bs) > 0;
3818 }
3819 
3820 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3821 {
3822     BDRVRawState *s = bs->opaque;
3823 
3824     if (s->fd < 0)
3825         return;
3826 
3827     (void) ioctl(s->fd, CDIOCALLOW);
3828 
3829     if (eject_flag) {
3830         if (ioctl(s->fd, CDIOCEJECT) < 0)
3831             perror("CDIOCEJECT");
3832     } else {
3833         if (ioctl(s->fd, CDIOCCLOSE) < 0)
3834             perror("CDIOCCLOSE");
3835     }
3836 
3837     cdrom_reopen(bs);
3838 }
3839 
3840 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3841 {
3842     BDRVRawState *s = bs->opaque;
3843 
3844     if (s->fd < 0)
3845         return;
3846     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3847         /*
3848          * Note: an error can happen if the distribution automatically
3849          * mounts the CD-ROM
3850          */
3851         /* perror("CDROM_LOCKDOOR"); */
3852     }
3853 }
3854 
3855 static BlockDriver bdrv_host_cdrom = {
3856     .format_name        = "host_cdrom",
3857     .protocol_name      = "host_cdrom",
3858     .instance_size      = sizeof(BDRVRawState),
3859     .bdrv_needs_filename = true,
3860     .bdrv_probe_device  = cdrom_probe_device,
3861     .bdrv_parse_filename = cdrom_parse_filename,
3862     .bdrv_file_open     = cdrom_open,
3863     .bdrv_close         = raw_close,
3864     .bdrv_reopen_prepare = raw_reopen_prepare,
3865     .bdrv_reopen_commit  = raw_reopen_commit,
3866     .bdrv_reopen_abort   = raw_reopen_abort,
3867     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3868     .create_opts         = &bdrv_create_opts_simple,
3869     .mutable_opts        = mutable_opts,
3870 
3871     .bdrv_co_preadv         = raw_co_preadv,
3872     .bdrv_co_pwritev        = raw_co_pwritev,
3873     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3874     .bdrv_refresh_limits = raw_refresh_limits,
3875     .bdrv_io_plug = raw_aio_plug,
3876     .bdrv_io_unplug = raw_aio_unplug,
3877     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3878 
3879     .bdrv_co_truncate    = raw_co_truncate,
3880     .bdrv_getlength      = raw_getlength,
3881     .has_variable_length = true,
3882     .bdrv_get_allocated_file_size
3883                         = raw_get_allocated_file_size,
3884 
3885     /* removable device support */
3886     .bdrv_is_inserted   = cdrom_is_inserted,
3887     .bdrv_eject         = cdrom_eject,
3888     .bdrv_lock_medium   = cdrom_lock_medium,
3889 };
3890 #endif /* __FreeBSD__ */
3891 
3892 #endif /* HAVE_HOST_BLOCK_DEVICE */
3893 
3894 static void bdrv_file_init(void)
3895 {
3896     /*
3897      * Register all the drivers.  Note that order is important, the driver
3898      * registered last will get probed first.
3899      */
3900     bdrv_register(&bdrv_file);
3901 #if defined(HAVE_HOST_BLOCK_DEVICE)
3902     bdrv_register(&bdrv_host_device);
3903 #ifdef __linux__
3904     bdrv_register(&bdrv_host_cdrom);
3905 #endif
3906 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3907     bdrv_register(&bdrv_host_cdrom);
3908 #endif
3909 #endif /* HAVE_HOST_BLOCK_DEVICE */
3910 }
3911 
3912 block_init(bdrv_file_init);
3913