xref: /openbmc/qemu/block/file-posix.c (revision db817b8c500a60873eba80cbf047900ae5b32766)
1 /*
2  * Block driver for RAW files (posix)
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu/cutils.h"
28 #include "qemu/error-report.h"
29 #include "block/block_int.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "trace.h"
33 #include "block/thread-pool.h"
34 #include "qemu/iov.h"
35 #include "block/raw-aio.h"
36 #include "qapi/qmp/qdict.h"
37 #include "qapi/qmp/qstring.h"
38 
39 #include "scsi/pr-manager.h"
40 #include "scsi/constants.h"
41 
42 #if defined(__APPLE__) && (__MACH__)
43 #include <paths.h>
44 #include <sys/param.h>
45 #include <IOKit/IOKitLib.h>
46 #include <IOKit/IOBSD.h>
47 #include <IOKit/storage/IOMediaBSDClient.h>
48 #include <IOKit/storage/IOMedia.h>
49 #include <IOKit/storage/IOCDMedia.h>
50 //#include <IOKit/storage/IOCDTypes.h>
51 #include <IOKit/storage/IODVDMedia.h>
52 #include <CoreFoundation/CoreFoundation.h>
53 #endif
54 
55 #ifdef __sun__
56 #define _POSIX_PTHREAD_SEMANTICS 1
57 #include <sys/dkio.h>
58 #endif
59 #ifdef __linux__
60 #include <sys/ioctl.h>
61 #include <sys/param.h>
62 #include <sys/syscall.h>
63 #include <linux/cdrom.h>
64 #include <linux/fd.h>
65 #include <linux/fs.h>
66 #include <linux/hdreg.h>
67 #include <scsi/sg.h>
68 #ifdef __s390__
69 #include <asm/dasd.h>
70 #endif
71 #ifndef FS_NOCOW_FL
72 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
73 #endif
74 #endif
75 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
76 #include <linux/falloc.h>
77 #endif
78 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
79 #include <sys/disk.h>
80 #include <sys/cdio.h>
81 #endif
82 
83 #ifdef __OpenBSD__
84 #include <sys/ioctl.h>
85 #include <sys/disklabel.h>
86 #include <sys/dkio.h>
87 #endif
88 
89 #ifdef __NetBSD__
90 #include <sys/ioctl.h>
91 #include <sys/disklabel.h>
92 #include <sys/dkio.h>
93 #include <sys/disk.h>
94 #endif
95 
96 #ifdef __DragonFly__
97 #include <sys/ioctl.h>
98 #include <sys/diskslice.h>
99 #endif
100 
101 #ifdef CONFIG_XFS
102 #include <xfs/xfs.h>
103 #endif
104 
/*
 * Debug tracing for this file.  Uncomment the DEBUG_BLOCK define below to
 * make DPRINTF() emit its output via printf().
 */
//#define DEBUG_BLOCK

/* DEBUG_BLOCK_PRINT is 1 when DEBUG_BLOCK is defined and 0 otherwise. */
#ifdef DEBUG_BLOCK
# define DEBUG_BLOCK_PRINT 1
#else
# define DEBUG_BLOCK_PRINT 0
#endif
/*
 * printf()-style debug output.  The call is guarded by a plain "if" on the
 * constant DEBUG_BLOCK_PRINT rather than removed by the preprocessor, so the
 * format string and arguments are still seen by the compiler when debugging
 * is switched off.
 */
#define DPRINTF(fmt, ...) \
do { \
    if (DEBUG_BLOCK_PRINT) { \
        printf(fmt, ## __VA_ARGS__); \
    } \
} while (0)
118 
/*
 * OS X does not have O_DSYNC.  When the platform headers do not provide it,
 * alias it to O_SYNC, or to O_FSYNC when O_SYNC is missing as well.  If none
 * of the three exists, O_DSYNC stays undefined.
 */
#ifndef O_DSYNC
#ifdef O_SYNC
#define O_DSYNC O_SYNC
#elif defined(O_FSYNC)
#define O_DSYNC O_FSYNC
#endif
#endif

/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
#ifndef O_DIRECT
#define O_DIRECT O_DSYNC
#endif
132 
/*
 * Values stored in BDRVRawState.type.  raw_open() uses FTYPE_FILE; the
 * reopen code adds O_NONBLOCK to the open flags for FTYPE_CD.
 */
#define FTYPE_FILE   0
#define FTYPE_CD     1

/*
 * Lower bound (in bytes) for the largest alignment that
 * raw_probe_alignment() tries; the actual bound is the maximum of this value
 * and the host page size.
 */
#define MAX_BLOCKSIZE	4096

/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 * leaving a few more bytes for its future use.
 *
 * Permission bit i is represented by the byte at offset
 * RAW_LOCK_PERM_BASE + i, and the corresponding "not shared" bit by the byte
 * at offset RAW_LOCK_SHARED_BASE + i. */
#define RAW_LOCK_PERM_BASE             100
#define RAW_LOCK_SHARED_BASE           200
142 
/* Driver state kept in BlockDriverState.opaque for an open file or device. */
typedef struct BDRVRawState {
    int fd;             /* descriptor of the image, opened by raw_open_common() */
    int lock_fd;        /* second descriptor used for byte locks; -1 if unused */
    bool use_lock;      /* result of the "locking" option (on/off/auto) */
    int type;           /* one of the FTYPE_* values */
    int open_flags;     /* flags that were passed to qemu_open() for fd */
    size_t buf_align;   /* buffer alignment set by raw_probe_alignment() */

    /* The current permissions. */
    uint64_t perm;
    uint64_t shared_perm;

#ifdef CONFIG_XFS
    bool is_xfs:1;      /* platform_test_xfs_fd() succeeded on fd */
#endif
    bool has_discard:1;         /* initialised to true on open */
    bool has_write_zeroes:1;    /* true on open; cleared for Linux block
                                 * devices opened without O_DIRECT */
    bool discard_zeroes:1;      /* regular file, or BLKDISCARDZEROES said so */
    bool use_linux_aio:1;       /* aio=native was selected */
    bool page_cache_inconsistent:1; /* NOTE(review): users are outside the
                                     * section reviewed here */
    bool has_fallocate;         /* set for regular files on open */
    bool needs_alignment;       /* set for BDRV_O_NOCACHE, and for character
                                 * devices on FreeBSD */
    bool check_cache_dropped;   /* value of "x-check-cache-dropped" */

    PRManager *pr_mgr;  /* looked up from the "pr-manager" option, if given */
} BDRVRawState;
169 
/*
 * Scratch state of one reopen transaction.  It is allocated by
 * raw_reopen_prepare() and released by raw_reopen_commit() or
 * raw_reopen_abort().
 */
typedef struct BDRVRawReopenState {
    int fd;                     /* descriptor for the new flags, -1 if none */
    int open_flags;             /* open flags computed for the reopen */
    bool check_cache_dropped;   /* new value of "x-check-cache-dropped" */
} BDRVRawReopenState;
175 
176 static int fd_open(BlockDriverState *bs);
177 static int64_t raw_getlength(BlockDriverState *bs);
178 
/*
 * Parameter block of a single request.
 * NOTE(review): the code that fills in and consumes this structure is not
 * part of the section reviewed here; the field notes below follow the names
 * only and should be confirmed against the users.
 */
typedef struct RawPosixAIOData {
    BlockDriverState *bs;
    int aio_fildes;             /* presumably the descriptor to operate on */
    union {
        struct iovec *aio_iov;
        void *aio_ioctl_buf;
    };
    int aio_niov;
    uint64_t aio_nbytes;
#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
    off_t aio_offset;
    int aio_type;
    /* The two anonymous structs below share storage. */
    union {
        struct {
            int aio_fd2;
            off_t aio_offset2;
        };
        struct {
            PreallocMode prealloc;
            Error **errp;
        };
    };
} RawPosixAIOData;
202 
203 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
204 static int cdrom_reopen(BlockDriverState *bs);
205 #endif
206 
#if defined(__NetBSD__)
/*
 * Replace the name of a block device by the name of the matching raw
 * (character) device, i.e. prefix the last path component with "r".
 *
 * @filename: in/out; on return it either still points to the caller's
 *            string (not a block device) or to a static buffer holding the
 *            rewritten name.  Because the buffer is static the result is
 *            overwritten by the next call.
 *
 * Returns 0 on success, -errno if lstat() fails.
 */
static int raw_normalize_devicepath(const char **filename)
{
    static char namebuf[PATH_MAX];
    const char *dp, *fname;
    struct stat sb;

    fname = *filename;
    dp = strrchr(fname, '/');
    if (lstat(fname, &sb) < 0) {
        /*
         * Capture errno first: strerror() and fprintf() are allowed to
         * modify it, which would make us return the wrong error code.
         */
        int saved_errno = errno;

        fprintf(stderr, "%s: stat failed: %s\n",
            fname, strerror(saved_errno));
        return -saved_errno;
    }

    if (!S_ISBLK(sb.st_mode)) {
        return 0;
    }

    if (dp == NULL) {
        /* No directory part: just prepend 'r' */
        snprintf(namebuf, PATH_MAX, "r%s", fname);
    } else {
        /* Keep the directory part and prepend 'r' to the last component */
        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
            (int)(dp - fname), fname, dp + 1);
    }
    fprintf(stderr, "%s is a block device", fname);
    *filename = namebuf;
    fprintf(stderr, ", using %s\n", *filename);

    return 0;
}
#else
/* No device name normalization is needed on this host; always succeeds. */
static int raw_normalize_devicepath(const char **filename)
{
    return 0;
}
#endif
244 
/*
 * Ask the host for the logical block size of @fd.
 *
 * Every size ioctl known at build time is issued; each one that succeeds
 * stores its answer in @sector_size_p, so the last successful one wins.
 *
 * Returns 0 if at least one ioctl succeeded.  Otherwise returns -errno,
 * which is -ENOTSUP when no ioctl is available on this host.
 */
static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
{
    static const unsigned long size_ioctls[] = {
#ifdef BLKSSZGET
        BLKSSZGET,
#endif
#ifdef DKIOCGETBLOCKSIZE
        DKIOCGETBLOCKSIZE,
#endif
#ifdef DIOCGSECTORSIZE
        DIOCGSECTORSIZE,
#endif
    };
    unsigned int size;
    bool found = false;
    int idx;

    /* Error reported when the table above is empty */
    errno = ENOTSUP;

    for (idx = 0; idx < (int)ARRAY_SIZE(size_ioctls); idx++) {
        if (ioctl(fd, size_ioctls[idx], &size) < 0) {
            continue;
        }
        *sector_size_p = size;
        found = true;
    }

    if (found) {
        return 0;
    }
    return -errno;
}
277 
/**
 * Query the physical block size of @fd with the BLKPBSZGET ioctl.
 * On success the size is written to @blk_size and 0 is returned.
 * On failure -errno is returned; hosts without BLKPBSZGET get -ENOTSUP.
 */
static int probe_physical_blocksize(int fd, unsigned int *blk_size)
{
#ifdef BLKPBSZGET
    return ioctl(fd, BLKPBSZGET, blk_size) < 0 ? -errno : 0;
#else
    return -ENOTSUP;
#endif
}
294 
/* Test whether a read of @len bytes into @buf is accepted on @fd.
 *
 * The read is issued at file offset 0 and only its outcome matters; it is
 * used to probe which buffer addresses and request lengths O_DIRECT accepts.
 */
static bool raw_is_io_aligned(int fd, void *buf, size_t len)
{
    if (pread(fd, buf, len, 0) >= 0) {
        return true;
    }

#ifdef __linux__
    /* A misaligned O_DIRECT read fails with EINVAL on Linux.  Any other
     * failure (for example a real I/O error on a broken drive) says nothing
     * about alignment, so it is counted as "aligned".
     */
    return errno != EINVAL;
#else
    return false;
#endif
}
319 
320 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
321 {
322     BDRVRawState *s = bs->opaque;
323     char *buf;
324     size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
325 
326     /* For SCSI generic devices the alignment is not really used.
327        With buffered I/O, we don't have any restrictions. */
328     if (bdrv_is_sg(bs) || !s->needs_alignment) {
329         bs->bl.request_alignment = 1;
330         s->buf_align = 1;
331         return;
332     }
333 
334     bs->bl.request_alignment = 0;
335     s->buf_align = 0;
336     /* Let's try to use the logical blocksize for the alignment. */
337     if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
338         bs->bl.request_alignment = 0;
339     }
340 #ifdef CONFIG_XFS
341     if (s->is_xfs) {
342         struct dioattr da;
343         if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
344             bs->bl.request_alignment = da.d_miniosz;
345             /* The kernel returns wrong information for d_mem */
346             /* s->buf_align = da.d_mem; */
347         }
348     }
349 #endif
350 
351     /* If we could not get the sizes so far, we can only guess them */
352     if (!s->buf_align) {
353         size_t align;
354         buf = qemu_memalign(max_align, 2 * max_align);
355         for (align = 512; align <= max_align; align <<= 1) {
356             if (raw_is_io_aligned(fd, buf + align, max_align)) {
357                 s->buf_align = align;
358                 break;
359             }
360         }
361         qemu_vfree(buf);
362     }
363 
364     if (!bs->bl.request_alignment) {
365         size_t align;
366         buf = qemu_memalign(s->buf_align, max_align);
367         for (align = 512; align <= max_align; align <<= 1) {
368             if (raw_is_io_aligned(fd, buf, align)) {
369                 bs->bl.request_alignment = align;
370                 break;
371             }
372         }
373         qemu_vfree(buf);
374     }
375 
376     if (!s->buf_align || !bs->bl.request_alignment) {
377         error_setg(errp, "Could not find working O_DIRECT alignment");
378         error_append_hint(errp, "Try cache.direct=off\n");
379     }
380 }
381 
382 static void raw_parse_flags(int bdrv_flags, int *open_flags)
383 {
384     assert(open_flags != NULL);
385 
386     *open_flags |= O_BINARY;
387     *open_flags &= ~O_ACCMODE;
388     if (bdrv_flags & BDRV_O_RDWR) {
389         *open_flags |= O_RDWR;
390     } else {
391         *open_flags |= O_RDONLY;
392     }
393 
394     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
395      * and O_DIRECT for no caching. */
396     if ((bdrv_flags & BDRV_O_NOCACHE)) {
397         *open_flags |= O_DIRECT;
398     }
399 }
400 
401 static void raw_parse_filename(const char *filename, QDict *options,
402                                Error **errp)
403 {
404     bdrv_parse_filename_strip_prefix(filename, "file:", options);
405 }
406 
/*
 * Runtime options understood by this driver.  raw_open_common() and
 * raw_reopen_prepare() absorb them from the options QDict.
 */
static QemuOptsList raw_runtime_opts = {
    .name = "raw",
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
        {
            /* parsed as a BlockdevAioOptions value */
            .name = "aio",
            .type = QEMU_OPT_STRING,
            .help = "host AIO implementation (threads, native)",
        },
        {
            /* parsed as an OnOffAuto value */
            .name = "locking",
            .type = QEMU_OPT_STRING,
            .help = "file locking mode (on/off/auto, default: auto)",
        },
        {
            /* resolved with pr_manager_lookup() */
            .name = "pr-manager",
            .type = QEMU_OPT_STRING,
            .help = "id of persistent reservation manager object (default: none)",
        },
        {
            .name = "x-check-cache-dropped",
            .type = QEMU_OPT_BOOL,
            .help = "check that page cache was dropped on live migration (default: off)"
        },
        { /* end of list */ }
    },
};
439 
440 static int raw_open_common(BlockDriverState *bs, QDict *options,
441                            int bdrv_flags, int open_flags,
442                            bool device, Error **errp)
443 {
444     BDRVRawState *s = bs->opaque;
445     QemuOpts *opts;
446     Error *local_err = NULL;
447     const char *filename = NULL;
448     const char *str;
449     BlockdevAioOptions aio, aio_default;
450     int fd, ret;
451     struct stat st;
452     OnOffAuto locking;
453 
454     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
455     qemu_opts_absorb_qdict(opts, options, &local_err);
456     if (local_err) {
457         error_propagate(errp, local_err);
458         ret = -EINVAL;
459         goto fail;
460     }
461 
462     filename = qemu_opt_get(opts, "filename");
463 
464     ret = raw_normalize_devicepath(&filename);
465     if (ret != 0) {
466         error_setg_errno(errp, -ret, "Could not normalize device path");
467         goto fail;
468     }
469 
470     aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
471                   ? BLOCKDEV_AIO_OPTIONS_NATIVE
472                   : BLOCKDEV_AIO_OPTIONS_THREADS;
473     aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
474                           qemu_opt_get(opts, "aio"),
475                           aio_default, &local_err);
476     if (local_err) {
477         error_propagate(errp, local_err);
478         ret = -EINVAL;
479         goto fail;
480     }
481     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
482 
483     locking = qapi_enum_parse(&OnOffAuto_lookup,
484                               qemu_opt_get(opts, "locking"),
485                               ON_OFF_AUTO_AUTO, &local_err);
486     if (local_err) {
487         error_propagate(errp, local_err);
488         ret = -EINVAL;
489         goto fail;
490     }
491     switch (locking) {
492     case ON_OFF_AUTO_ON:
493         s->use_lock = true;
494         if (!qemu_has_ofd_lock()) {
495             fprintf(stderr,
496                     "File lock requested but OFD locking syscall is "
497                     "unavailable, falling back to POSIX file locks.\n"
498                     "Due to the implementation, locks can be lost "
499                     "unexpectedly.\n");
500         }
501         break;
502     case ON_OFF_AUTO_OFF:
503         s->use_lock = false;
504         break;
505     case ON_OFF_AUTO_AUTO:
506         s->use_lock = qemu_has_ofd_lock();
507         break;
508     default:
509         abort();
510     }
511 
512     str = qemu_opt_get(opts, "pr-manager");
513     if (str) {
514         s->pr_mgr = pr_manager_lookup(str, &local_err);
515         if (local_err) {
516             error_propagate(errp, local_err);
517             ret = -EINVAL;
518             goto fail;
519         }
520     }
521 
522     s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
523                                                false);
524 
525     s->open_flags = open_flags;
526     raw_parse_flags(bdrv_flags, &s->open_flags);
527 
528     s->fd = -1;
529     fd = qemu_open(filename, s->open_flags, 0644);
530     if (fd < 0) {
531         ret = -errno;
532         error_setg_errno(errp, errno, "Could not open '%s'", filename);
533         if (ret == -EROFS) {
534             ret = -EACCES;
535         }
536         goto fail;
537     }
538     s->fd = fd;
539 
540     s->lock_fd = -1;
541     if (s->use_lock) {
542         fd = qemu_open(filename, s->open_flags);
543         if (fd < 0) {
544             ret = -errno;
545             error_setg_errno(errp, errno, "Could not open '%s' for locking",
546                              filename);
547             qemu_close(s->fd);
548             goto fail;
549         }
550         s->lock_fd = fd;
551     }
552     s->perm = 0;
553     s->shared_perm = BLK_PERM_ALL;
554 
555 #ifdef CONFIG_LINUX_AIO
556      /* Currently Linux does AIO only for files opened with O_DIRECT */
557     if (s->use_linux_aio) {
558         if (!(s->open_flags & O_DIRECT)) {
559             error_setg(errp, "aio=native was specified, but it requires "
560                              "cache.direct=on, which was not specified.");
561             ret = -EINVAL;
562             goto fail;
563         }
564         if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
565             error_prepend(errp, "Unable to use native AIO: ");
566             goto fail;
567         }
568     }
569 #else
570     if (s->use_linux_aio) {
571         error_setg(errp, "aio=native was specified, but is not supported "
572                          "in this build.");
573         ret = -EINVAL;
574         goto fail;
575     }
576 #endif /* !defined(CONFIG_LINUX_AIO) */
577 
578     s->has_discard = true;
579     s->has_write_zeroes = true;
580     if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
581         s->needs_alignment = true;
582     }
583 
584     if (fstat(s->fd, &st) < 0) {
585         ret = -errno;
586         error_setg_errno(errp, errno, "Could not stat file");
587         goto fail;
588     }
589 
590     if (!device) {
591         if (S_ISBLK(st.st_mode)) {
592             warn_report("Opening a block device as a file using the '%s' "
593                         "driver is deprecated", bs->drv->format_name);
594         } else if (S_ISCHR(st.st_mode)) {
595             warn_report("Opening a character device as a file using the '%s' "
596                         "driver is deprecated", bs->drv->format_name);
597         } else if (!S_ISREG(st.st_mode)) {
598             error_setg(errp, "A regular file was expected by the '%s' driver, "
599                        "but something else was given", bs->drv->format_name);
600             ret = -EINVAL;
601             goto fail;
602         } else {
603             s->discard_zeroes = true;
604             s->has_fallocate = true;
605         }
606     } else {
607         if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
608             error_setg(errp, "'%s' driver expects either "
609                        "a character or block device", bs->drv->format_name);
610             ret = -EINVAL;
611             goto fail;
612         }
613     }
614 
615     if (S_ISBLK(st.st_mode)) {
616 #ifdef BLKDISCARDZEROES
617         unsigned int arg;
618         if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
619             s->discard_zeroes = true;
620         }
621 #endif
622 #ifdef __linux__
623         /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
624          * not rely on the contents of discarded blocks unless using O_DIRECT.
625          * Same for BLKZEROOUT.
626          */
627         if (!(bs->open_flags & BDRV_O_NOCACHE)) {
628             s->discard_zeroes = false;
629             s->has_write_zeroes = false;
630         }
631 #endif
632     }
633 #ifdef __FreeBSD__
634     if (S_ISCHR(st.st_mode)) {
635         /*
636          * The file is a char device (disk), which on FreeBSD isn't behind
637          * a pager, so force all requests to be aligned. This is needed
638          * so QEMU makes sure all IO operations on the device are aligned
639          * to sector size, or else FreeBSD will reject them with EINVAL.
640          */
641         s->needs_alignment = true;
642     }
643 #endif
644 
645 #ifdef CONFIG_XFS
646     if (platform_test_xfs_fd(s->fd)) {
647         s->is_xfs = true;
648     }
649 #endif
650 
651     bs->supported_zero_flags = s->discard_zeroes ? BDRV_REQ_MAY_UNMAP : 0;
652     ret = 0;
653 fail:
654     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
655         unlink(filename);
656     }
657     qemu_opts_del(opts);
658     return ret;
659 }
660 
661 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
662                     Error **errp)
663 {
664     BDRVRawState *s = bs->opaque;
665 
666     s->type = FTYPE_FILE;
667     return raw_open_common(bs, options, flags, 0, false, errp);
668 }
669 
/* Phase of a permission change, as passed to raw_handle_perm_lock(). */
typedef enum {
    RAW_PL_PREPARE, /* take the additional locks and check for conflicts */
    RAW_PL_COMMIT,  /* keep only the locks of the new permissions */
    RAW_PL_ABORT,   /* go back to the locks of the current permissions */
} RawPermLockOp;
675 
/*
 * Iterate @i over the bit numbers 0, 1, 2, ... for as long as the single-bit
 * mask (1ULL << i) does not exceed BLK_PERM_ALL.  @i must be a modifiable
 * integer lvalue; every use of it is parenthesized so that any lvalue
 * expression can be passed safely.
 */
#define PERM_FOREACH(i) \
    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; (i)++)
678 
679 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
680  * file; if @unlock == true, also unlock the unneeded bytes.
681  * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
682  */
683 static int raw_apply_lock_bytes(int fd,
684                                 uint64_t perm_lock_bits,
685                                 uint64_t shared_perm_lock_bits,
686                                 bool unlock, Error **errp)
687 {
688     int ret;
689     int i;
690 
691     PERM_FOREACH(i) {
692         int off = RAW_LOCK_PERM_BASE + i;
693         if (perm_lock_bits & (1ULL << i)) {
694             ret = qemu_lock_fd(fd, off, 1, false);
695             if (ret) {
696                 error_setg(errp, "Failed to lock byte %d", off);
697                 return ret;
698             }
699         } else if (unlock) {
700             ret = qemu_unlock_fd(fd, off, 1);
701             if (ret) {
702                 error_setg(errp, "Failed to unlock byte %d", off);
703                 return ret;
704             }
705         }
706     }
707     PERM_FOREACH(i) {
708         int off = RAW_LOCK_SHARED_BASE + i;
709         if (shared_perm_lock_bits & (1ULL << i)) {
710             ret = qemu_lock_fd(fd, off, 1, false);
711             if (ret) {
712                 error_setg(errp, "Failed to lock byte %d", off);
713                 return ret;
714             }
715         } else if (unlock) {
716             ret = qemu_unlock_fd(fd, off, 1);
717             if (ret) {
718                 error_setg(errp, "Failed to unlock byte %d", off);
719                 return ret;
720             }
721         }
722     }
723     return 0;
724 }
725 
726 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
727 static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
728                                 Error **errp)
729 {
730     int ret;
731     int i;
732 
733     PERM_FOREACH(i) {
734         int off = RAW_LOCK_SHARED_BASE + i;
735         uint64_t p = 1ULL << i;
736         if (perm & p) {
737             ret = qemu_lock_fd_test(fd, off, 1, true);
738             if (ret) {
739                 char *perm_name = bdrv_perm_names(p);
740                 error_setg(errp,
741                            "Failed to get \"%s\" lock",
742                            perm_name);
743                 g_free(perm_name);
744                 error_append_hint(errp,
745                                   "Is another process using the image?\n");
746                 return ret;
747             }
748         }
749     }
750     PERM_FOREACH(i) {
751         int off = RAW_LOCK_PERM_BASE + i;
752         uint64_t p = 1ULL << i;
753         if (!(shared_perm & p)) {
754             ret = qemu_lock_fd_test(fd, off, 1, true);
755             if (ret) {
756                 char *perm_name = bdrv_perm_names(p);
757                 error_setg(errp,
758                            "Failed to get shared \"%s\" lock",
759                            perm_name);
760                 g_free(perm_name);
761                 error_append_hint(errp,
762                                   "Is another process using the image?\n");
763                 return ret;
764             }
765         }
766     }
767     return 0;
768 }
769 
770 static int raw_handle_perm_lock(BlockDriverState *bs,
771                                 RawPermLockOp op,
772                                 uint64_t new_perm, uint64_t new_shared,
773                                 Error **errp)
774 {
775     BDRVRawState *s = bs->opaque;
776     int ret = 0;
777     Error *local_err = NULL;
778 
779     if (!s->use_lock) {
780         return 0;
781     }
782 
783     if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
784         return 0;
785     }
786 
787     assert(s->lock_fd > 0);
788 
789     switch (op) {
790     case RAW_PL_PREPARE:
791         ret = raw_apply_lock_bytes(s->lock_fd, s->perm | new_perm,
792                                    ~s->shared_perm | ~new_shared,
793                                    false, errp);
794         if (!ret) {
795             ret = raw_check_lock_bytes(s->lock_fd, new_perm, new_shared, errp);
796             if (!ret) {
797                 return 0;
798             }
799         }
800         op = RAW_PL_ABORT;
801         /* fall through to unlock bytes. */
802     case RAW_PL_ABORT:
803         raw_apply_lock_bytes(s->lock_fd, s->perm, ~s->shared_perm,
804                              true, &local_err);
805         if (local_err) {
806             /* Theoretically the above call only unlocks bytes and it cannot
807              * fail. Something weird happened, report it.
808              */
809             error_report_err(local_err);
810         }
811         break;
812     case RAW_PL_COMMIT:
813         raw_apply_lock_bytes(s->lock_fd, new_perm, ~new_shared,
814                              true, &local_err);
815         if (local_err) {
816             /* Theoretically the above call only unlocks bytes and it cannot
817              * fail. Something weird happened, report it.
818              */
819             error_report_err(local_err);
820         }
821         break;
822     }
823     return ret;
824 }
825 
826 static int raw_reopen_prepare(BDRVReopenState *state,
827                               BlockReopenQueue *queue, Error **errp)
828 {
829     BDRVRawState *s;
830     BDRVRawReopenState *rs;
831     QemuOpts *opts;
832     int ret = 0;
833     Error *local_err = NULL;
834 
835     assert(state != NULL);
836     assert(state->bs != NULL);
837 
838     s = state->bs->opaque;
839 
840     state->opaque = g_new0(BDRVRawReopenState, 1);
841     rs = state->opaque;
842     rs->fd = -1;
843 
844     /* Handle options changes */
845     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
846     qemu_opts_absorb_qdict(opts, state->options, &local_err);
847     if (local_err) {
848         error_propagate(errp, local_err);
849         ret = -EINVAL;
850         goto out;
851     }
852 
853     rs->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
854                                                 s->check_cache_dropped);
855 
856     if (s->type == FTYPE_CD) {
857         rs->open_flags |= O_NONBLOCK;
858     }
859 
860     raw_parse_flags(state->flags, &rs->open_flags);
861 
862     int fcntl_flags = O_APPEND | O_NONBLOCK;
863 #ifdef O_NOATIME
864     fcntl_flags |= O_NOATIME;
865 #endif
866 
867 #ifdef O_ASYNC
868     /* Not all operating systems have O_ASYNC, and those that don't
869      * will not let us track the state into rs->open_flags (typically
870      * you achieve the same effect with an ioctl, for example I_SETSIG
871      * on Solaris). But we do not use O_ASYNC, so that's fine.
872      */
873     assert((s->open_flags & O_ASYNC) == 0);
874 #endif
875 
876     if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
877         /* dup the original fd */
878         rs->fd = qemu_dup(s->fd);
879         if (rs->fd >= 0) {
880             ret = fcntl_setfl(rs->fd, rs->open_flags);
881             if (ret) {
882                 qemu_close(rs->fd);
883                 rs->fd = -1;
884             }
885         }
886     }
887 
888     /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
889     if (rs->fd == -1) {
890         const char *normalized_filename = state->bs->filename;
891         ret = raw_normalize_devicepath(&normalized_filename);
892         if (ret < 0) {
893             error_setg_errno(errp, -ret, "Could not normalize device path");
894         } else {
895             assert(!(rs->open_flags & O_CREAT));
896             rs->fd = qemu_open(normalized_filename, rs->open_flags);
897             if (rs->fd == -1) {
898                 error_setg_errno(errp, errno, "Could not reopen file");
899                 ret = -1;
900             }
901         }
902     }
903 
904     /* Fail already reopen_prepare() if we can't get a working O_DIRECT
905      * alignment with the new fd. */
906     if (rs->fd != -1) {
907         raw_probe_alignment(state->bs, rs->fd, &local_err);
908         if (local_err) {
909             qemu_close(rs->fd);
910             rs->fd = -1;
911             error_propagate(errp, local_err);
912             ret = -EINVAL;
913         }
914     }
915 
916 out:
917     qemu_opts_del(opts);
918     return ret;
919 }
920 
921 static void raw_reopen_commit(BDRVReopenState *state)
922 {
923     BDRVRawReopenState *rs = state->opaque;
924     BDRVRawState *s = state->bs->opaque;
925 
926     s->check_cache_dropped = rs->check_cache_dropped;
927     s->open_flags = rs->open_flags;
928 
929     qemu_close(s->fd);
930     s->fd = rs->fd;
931 
932     g_free(state->opaque);
933     state->opaque = NULL;
934 }
935 
936 
937 static void raw_reopen_abort(BDRVReopenState *state)
938 {
939     BDRVRawReopenState *rs = state->opaque;
940 
941      /* nothing to do if NULL, we didn't get far enough */
942     if (rs == NULL) {
943         return;
944     }
945 
946     if (rs->fd >= 0) {
947         qemu_close(rs->fd);
948         rs->fd = -1;
949     }
950     g_free(state->opaque);
951     state->opaque = NULL;
952 }
953 
954 static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
955 {
956 #ifdef BLKSECTGET
957     int max_bytes = 0;
958     short max_sectors = 0;
959     if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
960         return max_bytes;
961     } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
962         return max_sectors << BDRV_SECTOR_BITS;
963     } else {
964         return -errno;
965     }
966 #else
967     return -ENOSYS;
968 #endif
969 }
970 
/*
 * Read the max_segments queue limit of the block device described by @st
 * from /sys/dev/block/<major>:<minor>/queue/max_segments.
 *
 * Returns the parsed value when the number is followed by a newline.
 * Otherwise returns the result of qemu_strtol(), -errno if the file cannot
 * be opened or read, -EIO if it is empty, or -ENOTSUP when built without
 * CONFIG_LINUX.
 */
static int hdev_get_max_segments(const struct stat *st)
{
#ifdef CONFIG_LINUX
    char text[32];
    const char *tail;
    char *path;
    long value;
    int sysfs_fd;
    int rc;

    path = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                           major(st->st_rdev), minor(st->st_rdev));
    sysfs_fd = open(path, O_RDONLY);
    if (sysfs_fd == -1) {
        rc = -errno;
        g_free(path);
        return rc;
    }

    /* Retry reads that were interrupted by a signal */
    do {
        rc = read(sysfs_fd, text, sizeof(text) - 1);
    } while (rc == -1 && errno == EINTR);

    if (rc < 0) {
        rc = -errno;
    } else if (rc == 0) {
        rc = -EIO;
    } else {
        text[rc] = 0;
        /* The file is ended with '\n', pass 'tail' to accept that. */
        rc = qemu_strtol(text, &tail, 10, &value);
        if (rc == 0 && tail && *tail == '\n') {
            rc = value;
        }
    }

    close(sysfs_fd);
    g_free(path);
    return rc;
#else
    return -ENOTSUP;
#endif
}
1015 
1016 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1017 {
1018     BDRVRawState *s = bs->opaque;
1019     struct stat st;
1020 
1021     if (!fstat(s->fd, &st)) {
1022         if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
1023             int ret = hdev_get_max_transfer_length(bs, s->fd);
1024             if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1025                 bs->bl.max_transfer = pow2floor(ret);
1026             }
1027             ret = hdev_get_max_segments(&st);
1028             if (ret > 0) {
1029                 bs->bl.max_transfer = MIN(bs->bl.max_transfer,
1030                                           ret * getpagesize());
1031             }
1032         }
1033     }
1034 
1035     raw_probe_alignment(bs, s->fd, errp);
1036     bs->bl.min_mem_alignment = s->buf_align;
1037     bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
1038 }
1039 
/*
 * Issue the BIODASDINFO2 ioctl on @fd and return its result.
 * Returns -1 when BIODASDINFO2 is not defined at build time.
 */
static int check_for_dasd(int fd)
{
#ifdef BIODASDINFO2
    struct dasd_information2_t dasd_info = {0};
    int rc = ioctl(fd, BIODASDINFO2, &dasd_info);

    return rc;
#else
    return -1;
#endif
}
1050 
1051 /**
1052  * Try to get @bs's logical and physical block size.
1053  * On success, store them in @bsz and return zero.
1054  * On failure, return negative errno.
1055  */
1056 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1057 {
1058     BDRVRawState *s = bs->opaque;
1059     int ret;
1060 
1061     /* If DASD, get blocksizes */
1062     if (check_for_dasd(s->fd) < 0) {
1063         return -ENOTSUP;
1064     }
1065     ret = probe_logical_blocksize(s->fd, &bsz->log);
1066     if (ret < 0) {
1067         return ret;
1068     }
1069     return probe_physical_blocksize(s->fd, &bsz->phys);
1070 }
1071 
1072 /**
1073  * Try to get @bs's geometry: cyls, heads, sectors.
1074  * On success, store them in @geo and return 0.
1075  * On failure return -errno.
1076  * (Allows block driver to assign default geometry values that guest sees)
1077  */
1078 #ifdef __linux__
1079 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1080 {
1081     BDRVRawState *s = bs->opaque;
1082     struct hd_geometry ioctl_geo = {0};
1083 
1084     /* If DASD, get its geometry */
1085     if (check_for_dasd(s->fd) < 0) {
1086         return -ENOTSUP;
1087     }
1088     if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1089         return -errno;
1090     }
1091     /* HDIO_GETGEO may return success even though geo contains zeros
1092        (e.g. certain multipath setups) */
1093     if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1094         return -ENOTSUP;
1095     }
1096     /* Do not return a geometry for partition */
1097     if (ioctl_geo.start != 0) {
1098         return -ENOTSUP;
1099     }
1100     geo->heads = ioctl_geo.heads;
1101     geo->sectors = ioctl_geo.sectors;
1102     geo->cylinders = ioctl_geo.cylinders;
1103 
1104     return 0;
1105 }
1106 #else /* __linux__ */
1107 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1108 {
1109     return -ENOTSUP;
1110 }
1111 #endif
1112 
1113 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
1114 {
1115     int ret;
1116 
1117     ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
1118     if (ret == -1) {
1119         return -errno;
1120     }
1121 
1122     return 0;
1123 }
1124 
1125 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
1126 {
1127     BDRVRawState *s = aiocb->bs->opaque;
1128     int ret;
1129 
1130     if (s->page_cache_inconsistent) {
1131         return -EIO;
1132     }
1133 
1134     ret = qemu_fdatasync(aiocb->aio_fildes);
1135     if (ret == -1) {
1136         /* There is no clear definition of the semantics of a failing fsync(),
1137          * so we may have to assume the worst. The sad truth is that this
1138          * assumption is correct for Linux. Some pages are now probably marked
1139          * clean in the page cache even though they are inconsistent with the
1140          * on-disk contents. The next fdatasync() call would succeed, but no
1141          * further writeback attempt will be made. We can't get back to a state
1142          * in which we know what is on disk (we would have to rewrite
1143          * everything that was touched since the last fdatasync() at least), so
1144          * make bdrv_flush() fail permanently. Given that the behaviour isn't
1145          * really defined, I have little hope that other OSes are doing better.
1146          *
1147          * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1148          * cache. */
1149         if ((s->open_flags & O_DIRECT) == 0) {
1150             s->page_cache_inconsistent = true;
1151         }
1152         return -errno;
1153     }
1154     return 0;
1155 }
1156 
#ifdef CONFIG_PREADV

/* Vectored positional I/O is compiled in: forward to the host calls. */
static bool preadv_present = true;

static ssize_t qemu_preadv(int fd, const struct iovec *iov, int nr_iov,
                           off_t offset)
{
    return preadv(fd, iov, nr_iov, offset);
}

static ssize_t qemu_pwritev(int fd, const struct iovec *iov, int nr_iov,
                            off_t offset)
{
    return pwritev(fd, iov, nr_iov, offset);
}

#else

/* Vectored positional I/O is not compiled in: stubs return -ENOSYS. */
static bool preadv_present = false;

static ssize_t qemu_preadv(int fd, const struct iovec *iov, int nr_iov,
                           off_t offset)
{
    return -ENOSYS;
}

static ssize_t qemu_pwritev(int fd, const struct iovec *iov, int nr_iov,
                            off_t offset)
{
    return -ENOSYS;
}

#endif
1190 
1191 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1192 {
1193     ssize_t len;
1194 
1195     do {
1196         if (aiocb->aio_type & QEMU_AIO_WRITE)
1197             len = qemu_pwritev(aiocb->aio_fildes,
1198                                aiocb->aio_iov,
1199                                aiocb->aio_niov,
1200                                aiocb->aio_offset);
1201          else
1202             len = qemu_preadv(aiocb->aio_fildes,
1203                               aiocb->aio_iov,
1204                               aiocb->aio_niov,
1205                               aiocb->aio_offset);
1206     } while (len == -1 && errno == EINTR);
1207 
1208     if (len == -1) {
1209         return -errno;
1210     }
1211     return len;
1212 }
1213 
1214 /*
1215  * Read/writes the data to/from a given linear buffer.
1216  *
1217  * Returns the number of bytes handles or -errno in case of an error. Short
1218  * reads are only returned if the end of the file is reached.
1219  */
1220 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1221 {
1222     ssize_t offset = 0;
1223     ssize_t len;
1224 
1225     while (offset < aiocb->aio_nbytes) {
1226         if (aiocb->aio_type & QEMU_AIO_WRITE) {
1227             len = pwrite(aiocb->aio_fildes,
1228                          (const char *)buf + offset,
1229                          aiocb->aio_nbytes - offset,
1230                          aiocb->aio_offset + offset);
1231         } else {
1232             len = pread(aiocb->aio_fildes,
1233                         buf + offset,
1234                         aiocb->aio_nbytes - offset,
1235                         aiocb->aio_offset + offset);
1236         }
1237         if (len == -1 && errno == EINTR) {
1238             continue;
1239         } else if (len == -1 && errno == EINVAL &&
1240                    (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1241                    !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1242                    offset > 0) {
1243             /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1244              * after a short read.  Assume that O_DIRECT short reads only occur
1245              * at EOF.  Therefore this is a short read, not an I/O error.
1246              */
1247             break;
1248         } else if (len == -1) {
1249             offset = -errno;
1250             break;
1251         } else if (len == 0) {
1252             break;
1253         }
1254         offset += len;
1255     }
1256 
1257     return offset;
1258 }
1259 
1260 static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
1261 {
1262     ssize_t nbytes;
1263     char *buf;
1264 
1265     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1266         /*
1267          * If there is just a single buffer, and it is properly aligned
1268          * we can just use plain pread/pwrite without any problems.
1269          */
1270         if (aiocb->aio_niov == 1) {
1271              return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
1272         }
1273         /*
1274          * We have more than one iovec, and all are properly aligned.
1275          *
1276          * Try preadv/pwritev first and fall back to linearizing the
1277          * buffer if it's not supported.
1278          */
1279         if (preadv_present) {
1280             nbytes = handle_aiocb_rw_vector(aiocb);
1281             if (nbytes == aiocb->aio_nbytes ||
1282                 (nbytes < 0 && nbytes != -ENOSYS)) {
1283                 return nbytes;
1284             }
1285             preadv_present = false;
1286         }
1287 
1288         /*
1289          * XXX(hch): short read/write.  no easy way to handle the reminder
1290          * using these interfaces.  For now retry using plain
1291          * pread/pwrite?
1292          */
1293     }
1294 
1295     /*
1296      * Ok, we have to do it the hard way, copy all segments into
1297      * a single aligned buffer.
1298      */
1299     buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1300     if (buf == NULL) {
1301         return -ENOMEM;
1302     }
1303 
1304     if (aiocb->aio_type & QEMU_AIO_WRITE) {
1305         char *p = buf;
1306         int i;
1307 
1308         for (i = 0; i < aiocb->aio_niov; ++i) {
1309             memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
1310             p += aiocb->aio_iov[i].iov_len;
1311         }
1312         assert(p - buf == aiocb->aio_nbytes);
1313     }
1314 
1315     nbytes = handle_aiocb_rw_linear(aiocb, buf);
1316     if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1317         char *p = buf;
1318         size_t count = aiocb->aio_nbytes, copy;
1319         int i;
1320 
1321         for (i = 0; i < aiocb->aio_niov && count; ++i) {
1322             copy = count;
1323             if (copy > aiocb->aio_iov[i].iov_len) {
1324                 copy = aiocb->aio_iov[i].iov_len;
1325             }
1326             memcpy(aiocb->aio_iov[i].iov_base, p, copy);
1327             assert(count >= copy);
1328             p     += copy;
1329             count -= copy;
1330         }
1331         assert(count == 0);
1332     }
1333     qemu_vfree(buf);
1334 
1335     return nbytes;
1336 }
1337 
1338 #ifdef CONFIG_XFS
1339 static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
1340 {
1341     struct xfs_flock64 fl;
1342     int err;
1343 
1344     memset(&fl, 0, sizeof(fl));
1345     fl.l_whence = SEEK_SET;
1346     fl.l_start = offset;
1347     fl.l_len = bytes;
1348 
1349     if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
1350         err = errno;
1351         DPRINTF("cannot write zero range (%s)\n", strerror(errno));
1352         return -err;
1353     }
1354 
1355     return 0;
1356 }
1357 
1358 static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
1359 {
1360     struct xfs_flock64 fl;
1361     int err;
1362 
1363     memset(&fl, 0, sizeof(fl));
1364     fl.l_whence = SEEK_SET;
1365     fl.l_start = offset;
1366     fl.l_len = bytes;
1367 
1368     if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
1369         err = errno;
1370         DPRINTF("cannot punch hole (%s)\n", strerror(errno));
1371         return -err;
1372     }
1373 
1374     return 0;
1375 }
1376 #endif
1377 
/*
 * Map the negative errno values that indicate an unavailable operation
 * (-ENODEV, -ENOSYS, -EOPNOTSUPP, -ENOTTY) to -ENOTSUP.
 * Any other value is returned unchanged.
 */
static int translate_err(int err)
{
    static const int unsupported[] = { ENODEV, ENOSYS, EOPNOTSUPP, ENOTTY };
    unsigned i;

    for (i = 0; i < sizeof(unsupported) / sizeof(unsupported[0]); i++) {
        if (err == -unsupported[i]) {
            return -ENOTSUP;
        }
    }
    return err;
}
1386 
#ifdef CONFIG_FALLOCATE
/*
 * Call fallocate() with the given arguments, retrying while it fails with
 * EINTR.
 *
 * Returns 0 on success, otherwise -errno passed through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    while (fallocate(fd, mode, offset, len) != 0) {
        if (errno != EINTR) {
            return translate_err(-errno);
        }
    }
    return 0;
}
#endif
1398 
1399 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1400 {
1401     int ret = -ENOTSUP;
1402     BDRVRawState *s = aiocb->bs->opaque;
1403 
1404     if (!s->has_write_zeroes) {
1405         return -ENOTSUP;
1406     }
1407 
1408 #ifdef BLKZEROOUT
1409     do {
1410         uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1411         if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1412             return 0;
1413         }
1414     } while (errno == EINTR);
1415 
1416     ret = translate_err(-errno);
1417 #endif
1418 
1419     if (ret == -ENOTSUP) {
1420         s->has_write_zeroes = false;
1421     }
1422     return ret;
1423 }
1424 
1425 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
1426 {
1427 #if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
1428     BDRVRawState *s = aiocb->bs->opaque;
1429 #endif
1430 #ifdef CONFIG_FALLOCATE
1431     int64_t len;
1432 #endif
1433 
1434     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1435         return handle_aiocb_write_zeroes_block(aiocb);
1436     }
1437 
1438 #ifdef CONFIG_XFS
1439     if (s->is_xfs) {
1440         return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
1441     }
1442 #endif
1443 
1444 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
1445     if (s->has_write_zeroes) {
1446         int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1447                                aiocb->aio_offset, aiocb->aio_nbytes);
1448         if (ret == 0 || ret != -ENOTSUP) {
1449             return ret;
1450         }
1451         s->has_write_zeroes = false;
1452     }
1453 #endif
1454 
1455 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1456     if (s->has_discard && s->has_fallocate) {
1457         int ret = do_fallocate(s->fd,
1458                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1459                                aiocb->aio_offset, aiocb->aio_nbytes);
1460         if (ret == 0) {
1461             ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1462             if (ret == 0 || ret != -ENOTSUP) {
1463                 return ret;
1464             }
1465             s->has_fallocate = false;
1466         } else if (ret != -ENOTSUP) {
1467             return ret;
1468         } else {
1469             s->has_discard = false;
1470         }
1471     }
1472 #endif
1473 
1474 #ifdef CONFIG_FALLOCATE
1475     /* Last resort: we are trying to extend the file with zeroed data. This
1476      * can be done via fallocate(fd, 0) */
1477     len = bdrv_getlength(aiocb->bs);
1478     if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1479         int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1480         if (ret == 0 || ret != -ENOTSUP) {
1481             return ret;
1482         }
1483         s->has_fallocate = false;
1484     }
1485 #endif
1486 
1487     return -ENOTSUP;
1488 }
1489 
#ifndef HAVE_COPY_FILE_RANGE
/*
 * Replacement for copy_file_range() on builds without
 * HAVE_COPY_FILE_RANGE.  Invokes the raw system call when its number is
 * known at build time; otherwise fails with errno set to ENOSYS.
 */
static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                             off_t *out_off, size_t len, unsigned int flags)
{
#ifndef __NR_copy_file_range
    errno = ENOSYS;
    return -1;
#else
    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                   out_off, len, flags);
#endif
}
#endif
1503 
1504 static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb)
1505 {
1506     uint64_t bytes = aiocb->aio_nbytes;
1507     off_t in_off = aiocb->aio_offset;
1508     off_t out_off = aiocb->aio_offset2;
1509 
1510     while (bytes) {
1511         ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1512                                       aiocb->aio_fd2, &out_off,
1513                                       bytes, 0);
1514         trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1515                                    aiocb->aio_fd2, out_off, bytes, 0, ret);
1516         if (ret == 0) {
1517             /* No progress (e.g. when beyond EOF), let the caller fall back to
1518              * buffer I/O. */
1519             return -ENOSPC;
1520         }
1521         if (ret < 0) {
1522             switch (errno) {
1523             case ENOSYS:
1524                 return -ENOTSUP;
1525             case EINTR:
1526                 continue;
1527             default:
1528                 return -errno;
1529             }
1530         }
1531         bytes -= ret;
1532     }
1533     return 0;
1534 }
1535 
1536 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
1537 {
1538     int ret = -EOPNOTSUPP;
1539     BDRVRawState *s = aiocb->bs->opaque;
1540 
1541     if (!s->has_discard) {
1542         return -ENOTSUP;
1543     }
1544 
1545     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1546 #ifdef BLKDISCARD
1547         do {
1548             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1549             if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1550                 return 0;
1551             }
1552         } while (errno == EINTR);
1553 
1554         ret = -errno;
1555 #endif
1556     } else {
1557 #ifdef CONFIG_XFS
1558         if (s->is_xfs) {
1559             return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1560         }
1561 #endif
1562 
1563 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1564         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1565                            aiocb->aio_offset, aiocb->aio_nbytes);
1566 #endif
1567     }
1568 
1569     ret = translate_err(ret);
1570     if (ret == -ENOTSUP) {
1571         s->has_discard = false;
1572     }
1573     return ret;
1574 }
1575 
1576 static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
1577 {
1578     int result = 0;
1579     int64_t current_length = 0;
1580     char *buf = NULL;
1581     struct stat st;
1582     int fd = aiocb->aio_fildes;
1583     int64_t offset = aiocb->aio_offset;
1584     Error **errp = aiocb->errp;
1585 
1586     if (fstat(fd, &st) < 0) {
1587         result = -errno;
1588         error_setg_errno(errp, -result, "Could not stat file");
1589         return result;
1590     }
1591 
1592     current_length = st.st_size;
1593     if (current_length > offset && aiocb->prealloc != PREALLOC_MODE_OFF) {
1594         error_setg(errp, "Cannot use preallocation for shrinking files");
1595         return -ENOTSUP;
1596     }
1597 
1598     switch (aiocb->prealloc) {
1599 #ifdef CONFIG_POSIX_FALLOCATE
1600     case PREALLOC_MODE_FALLOC:
1601         /*
1602          * Truncating before posix_fallocate() makes it about twice slower on
1603          * file systems that do not support fallocate(), trying to check if a
1604          * block is allocated before allocating it, so don't do that here.
1605          */
1606         if (offset != current_length) {
1607             result = -posix_fallocate(fd, current_length,
1608                                       offset - current_length);
1609             if (result != 0) {
1610                 /* posix_fallocate() doesn't set errno. */
1611                 error_setg_errno(errp, -result,
1612                                  "Could not preallocate new data");
1613             }
1614         } else {
1615             result = 0;
1616         }
1617         goto out;
1618 #endif
1619     case PREALLOC_MODE_FULL:
1620     {
1621         int64_t num = 0, left = offset - current_length;
1622         off_t seek_result;
1623 
1624         /*
1625          * Knowing the final size from the beginning could allow the file
1626          * system driver to do less allocations and possibly avoid
1627          * fragmentation of the file.
1628          */
1629         if (ftruncate(fd, offset) != 0) {
1630             result = -errno;
1631             error_setg_errno(errp, -result, "Could not resize file");
1632             goto out;
1633         }
1634 
1635         buf = g_malloc0(65536);
1636 
1637         seek_result = lseek(fd, current_length, SEEK_SET);
1638         if (seek_result < 0) {
1639             result = -errno;
1640             error_setg_errno(errp, -result,
1641                              "Failed to seek to the old end of file");
1642             goto out;
1643         }
1644 
1645         while (left > 0) {
1646             num = MIN(left, 65536);
1647             result = write(fd, buf, num);
1648             if (result < 0) {
1649                 result = -errno;
1650                 error_setg_errno(errp, -result,
1651                                  "Could not write zeros for preallocation");
1652                 goto out;
1653             }
1654             left -= result;
1655         }
1656         if (result >= 0) {
1657             result = fsync(fd);
1658             if (result < 0) {
1659                 result = -errno;
1660                 error_setg_errno(errp, -result,
1661                                  "Could not flush file to disk");
1662                 goto out;
1663             }
1664         }
1665         goto out;
1666     }
1667     case PREALLOC_MODE_OFF:
1668         if (ftruncate(fd, offset) != 0) {
1669             result = -errno;
1670             error_setg_errno(errp, -result, "Could not resize file");
1671         }
1672         return result;
1673     default:
1674         result = -ENOTSUP;
1675         error_setg(errp, "Unsupported preallocation mode: %s",
1676                    PreallocMode_str(aiocb->prealloc));
1677         return result;
1678     }
1679 
1680 out:
1681     if (result < 0) {
1682         if (ftruncate(fd, current_length) < 0) {
1683             error_report("Failed to restore old file length: %s",
1684                          strerror(errno));
1685         }
1686     }
1687 
1688     g_free(buf);
1689     return result;
1690 }
1691 
1692 static int aio_worker(void *arg)
1693 {
1694     RawPosixAIOData *aiocb = arg;
1695     ssize_t ret = 0;
1696 
1697     switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
1698     case QEMU_AIO_READ:
1699         ret = handle_aiocb_rw(aiocb);
1700         if (ret >= 0 && ret < aiocb->aio_nbytes) {
1701             iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
1702                       0, aiocb->aio_nbytes - ret);
1703 
1704             ret = aiocb->aio_nbytes;
1705         }
1706         if (ret == aiocb->aio_nbytes) {
1707             ret = 0;
1708         } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1709             ret = -EINVAL;
1710         }
1711         break;
1712     case QEMU_AIO_WRITE:
1713         ret = handle_aiocb_rw(aiocb);
1714         if (ret == aiocb->aio_nbytes) {
1715             ret = 0;
1716         } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1717             ret = -EINVAL;
1718         }
1719         break;
1720     case QEMU_AIO_FLUSH:
1721         ret = handle_aiocb_flush(aiocb);
1722         break;
1723     case QEMU_AIO_IOCTL:
1724         ret = handle_aiocb_ioctl(aiocb);
1725         break;
1726     case QEMU_AIO_DISCARD:
1727         ret = handle_aiocb_discard(aiocb);
1728         break;
1729     case QEMU_AIO_WRITE_ZEROES:
1730         ret = handle_aiocb_write_zeroes(aiocb);
1731         break;
1732     case QEMU_AIO_COPY_RANGE:
1733         ret = handle_aiocb_copy_range(aiocb);
1734         break;
1735     case QEMU_AIO_TRUNCATE:
1736         ret = handle_aiocb_truncate(aiocb);
1737         break;
1738     default:
1739         fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
1740         ret = -EINVAL;
1741         break;
1742     }
1743 
1744     g_free(aiocb);
1745     return ret;
1746 }
1747 
1748 static int paio_submit_co_full(BlockDriverState *bs, int fd,
1749                                int64_t offset, int fd2, int64_t offset2,
1750                                QEMUIOVector *qiov,
1751                                int bytes, int type)
1752 {
1753     RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1754     ThreadPool *pool;
1755 
1756     acb->bs = bs;
1757     acb->aio_type = type;
1758     acb->aio_fildes = fd;
1759     acb->aio_fd2 = fd2;
1760     acb->aio_offset2 = offset2;
1761 
1762     acb->aio_nbytes = bytes;
1763     acb->aio_offset = offset;
1764 
1765     if (qiov) {
1766         acb->aio_iov = qiov->iov;
1767         acb->aio_niov = qiov->niov;
1768         assert(qiov->size == bytes);
1769     }
1770 
1771     trace_file_paio_submit_co(offset, bytes, type);
1772     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1773     return thread_pool_submit_co(pool, aio_worker, acb);
1774 }
1775 
1776 static inline int paio_submit_co(BlockDriverState *bs, int fd,
1777                                  int64_t offset, QEMUIOVector *qiov,
1778                                  int bytes, int type)
1779 {
1780     return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type);
1781 }
1782 
1783 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1784                                    uint64_t bytes, QEMUIOVector *qiov, int type)
1785 {
1786     BDRVRawState *s = bs->opaque;
1787 
1788     if (fd_open(bs) < 0)
1789         return -EIO;
1790 
1791     /*
1792      * Check if the underlying device requires requests to be aligned,
1793      * and if the request we are trying to submit is aligned or not.
1794      * If this is the case tell the low-level driver that it needs
1795      * to copy the buffer.
1796      */
1797     if (s->needs_alignment) {
1798         if (!bdrv_qiov_is_aligned(bs, qiov)) {
1799             type |= QEMU_AIO_MISALIGNED;
1800 #ifdef CONFIG_LINUX_AIO
1801         } else if (s->use_linux_aio) {
1802             LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1803             assert(qiov->size == bytes);
1804             return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
1805 #endif
1806         }
1807     }
1808 
1809     return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
1810 }
1811 
1812 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
1813                                       uint64_t bytes, QEMUIOVector *qiov,
1814                                       int flags)
1815 {
1816     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
1817 }
1818 
1819 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
1820                                        uint64_t bytes, QEMUIOVector *qiov,
1821                                        int flags)
1822 {
1823     assert(flags == 0);
1824     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
1825 }
1826 
1827 static void raw_aio_plug(BlockDriverState *bs)
1828 {
1829 #ifdef CONFIG_LINUX_AIO
1830     BDRVRawState *s = bs->opaque;
1831     if (s->use_linux_aio) {
1832         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1833         laio_io_plug(bs, aio);
1834     }
1835 #endif
1836 }
1837 
1838 static void raw_aio_unplug(BlockDriverState *bs)
1839 {
1840 #ifdef CONFIG_LINUX_AIO
1841     BDRVRawState *s = bs->opaque;
1842     if (s->use_linux_aio) {
1843         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1844         laio_io_unplug(bs, aio);
1845     }
1846 #endif
1847 }
1848 
1849 static int raw_co_flush_to_disk(BlockDriverState *bs)
1850 {
1851     BDRVRawState *s = bs->opaque;
1852     int ret;
1853 
1854     ret = fd_open(bs);
1855     if (ret < 0) {
1856         return ret;
1857     }
1858 
1859     return paio_submit_co(bs, s->fd, 0, NULL, 0, QEMU_AIO_FLUSH);
1860 }
1861 
1862 static void raw_aio_attach_aio_context(BlockDriverState *bs,
1863                                        AioContext *new_context)
1864 {
1865 #ifdef CONFIG_LINUX_AIO
1866     BDRVRawState *s = bs->opaque;
1867     if (s->use_linux_aio) {
1868         Error *local_err;
1869         if (!aio_setup_linux_aio(new_context, &local_err)) {
1870             error_reportf_err(local_err, "Unable to use native AIO, "
1871                                          "falling back to thread pool: ");
1872             s->use_linux_aio = false;
1873         }
1874     }
1875 #endif
1876 }
1877 
1878 static void raw_close(BlockDriverState *bs)
1879 {
1880     BDRVRawState *s = bs->opaque;
1881 
1882     if (s->fd >= 0) {
1883         qemu_close(s->fd);
1884         s->fd = -1;
1885     }
1886     if (s->lock_fd >= 0) {
1887         qemu_close(s->lock_fd);
1888         s->lock_fd = -1;
1889     }
1890 }
1891 
1892 /**
1893  * Truncates the given regular file @fd to @offset and, when growing, fills the
1894  * new space according to @prealloc.
1895  *
1896  * Returns: 0 on success, -errno on failure.
1897  */
1898 static int coroutine_fn
1899 raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
1900                      PreallocMode prealloc, Error **errp)
1901 {
1902     RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1903     ThreadPool *pool;
1904 
1905     *acb = (RawPosixAIOData) {
1906         .bs             = bs,
1907         .aio_fildes     = fd,
1908         .aio_type       = QEMU_AIO_TRUNCATE,
1909         .aio_offset     = offset,
1910         .prealloc       = prealloc,
1911         .errp           = errp,
1912     };
1913 
1914     /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
1915     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1916     return thread_pool_submit_co(pool, aio_worker, acb);
1917 }
1918 
1919 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
1920                                         PreallocMode prealloc, Error **errp)
1921 {
1922     BDRVRawState *s = bs->opaque;
1923     struct stat st;
1924     int ret;
1925 
1926     if (fstat(s->fd, &st)) {
1927         ret = -errno;
1928         error_setg_errno(errp, -ret, "Failed to fstat() the file");
1929         return ret;
1930     }
1931 
1932     if (S_ISREG(st.st_mode)) {
1933         return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
1934     }
1935 
1936     if (prealloc != PREALLOC_MODE_OFF) {
1937         error_setg(errp, "Preallocation mode '%s' unsupported for this "
1938                    "non-regular file", PreallocMode_str(prealloc));
1939         return -ENOTSUP;
1940     }
1941 
1942     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1943         if (offset > raw_getlength(bs)) {
1944             error_setg(errp, "Cannot grow device files");
1945             return -EINVAL;
1946         }
1947     } else {
1948         error_setg(errp, "Resizing this file is not supported");
1949         return -ENOTSUP;
1950     }
1951 
1952     return 0;
1953 }
1954 
1955 #ifdef __OpenBSD__
1956 static int64_t raw_getlength(BlockDriverState *bs)
1957 {
1958     BDRVRawState *s = bs->opaque;
1959     int fd = s->fd;
1960     struct stat st;
1961 
1962     if (fstat(fd, &st))
1963         return -errno;
1964     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1965         struct disklabel dl;
1966 
1967         if (ioctl(fd, DIOCGDINFO, &dl))
1968             return -errno;
1969         return (uint64_t)dl.d_secsize *
1970             dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1971     } else
1972         return st.st_size;
1973 }
1974 #elif defined(__NetBSD__)
1975 static int64_t raw_getlength(BlockDriverState *bs)
1976 {
1977     BDRVRawState *s = bs->opaque;
1978     int fd = s->fd;
1979     struct stat st;
1980 
1981     if (fstat(fd, &st))
1982         return -errno;
1983     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1984         struct dkwedge_info dkw;
1985 
1986         if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
1987             return dkw.dkw_size * 512;
1988         } else {
1989             struct disklabel dl;
1990 
1991             if (ioctl(fd, DIOCGDINFO, &dl))
1992                 return -errno;
1993             return (uint64_t)dl.d_secsize *
1994                 dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1995         }
1996     } else
1997         return st.st_size;
1998 }
1999 #elif defined(__sun__)
2000 static int64_t raw_getlength(BlockDriverState *bs)
2001 {
2002     BDRVRawState *s = bs->opaque;
2003     struct dk_minfo minfo;
2004     int ret;
2005     int64_t size;
2006 
2007     ret = fd_open(bs);
2008     if (ret < 0) {
2009         return ret;
2010     }
2011 
2012     /*
2013      * Use the DKIOCGMEDIAINFO ioctl to read the size.
2014      */
2015     ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2016     if (ret != -1) {
2017         return minfo.dki_lbsize * minfo.dki_capacity;
2018     }
2019 
2020     /*
2021      * There are reports that lseek on some devices fails, but
2022      * irc discussion said that contingency on contingency was overkill.
2023      */
2024     size = lseek(s->fd, 0, SEEK_END);
2025     if (size < 0) {
2026         return -errno;
2027     }
2028     return size;
2029 }
2030 #elif defined(CONFIG_BSD)
2031 static int64_t raw_getlength(BlockDriverState *bs)
2032 {
2033     BDRVRawState *s = bs->opaque;
2034     int fd = s->fd;
2035     int64_t size;
2036     struct stat sb;
2037 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2038     int reopened = 0;
2039 #endif
2040     int ret;
2041 
2042     ret = fd_open(bs);
2043     if (ret < 0)
2044         return ret;
2045 
2046 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2047 again:
2048 #endif
2049     if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2050 #ifdef DIOCGMEDIASIZE
2051 	if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
2052 #elif defined(DIOCGPART)
2053         {
2054                 struct partinfo pi;
2055                 if (ioctl(fd, DIOCGPART, &pi) == 0)
2056                         size = pi.media_size;
2057                 else
2058                         size = 0;
2059         }
2060         if (size == 0)
2061 #endif
2062 #if defined(__APPLE__) && defined(__MACH__)
2063         {
2064             uint64_t sectors = 0;
2065             uint32_t sector_size = 0;
2066 
2067             if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2068                && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2069                 size = sectors * sector_size;
2070             } else {
2071                 size = lseek(fd, 0LL, SEEK_END);
2072                 if (size < 0) {
2073                     return -errno;
2074                 }
2075             }
2076         }
2077 #else
2078         size = lseek(fd, 0LL, SEEK_END);
2079         if (size < 0) {
2080             return -errno;
2081         }
2082 #endif
2083 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2084         switch(s->type) {
2085         case FTYPE_CD:
2086             /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2087             if (size == 2048LL * (unsigned)-1)
2088                 size = 0;
2089             /* XXX no disc?  maybe we need to reopen... */
2090             if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2091                 reopened = 1;
2092                 goto again;
2093             }
2094         }
2095 #endif
2096     } else {
2097         size = lseek(fd, 0, SEEK_END);
2098         if (size < 0) {
2099             return -errno;
2100         }
2101     }
2102     return size;
2103 }
2104 #else
2105 static int64_t raw_getlength(BlockDriverState *bs)
2106 {
2107     BDRVRawState *s = bs->opaque;
2108     int ret;
2109     int64_t size;
2110 
2111     ret = fd_open(bs);
2112     if (ret < 0) {
2113         return ret;
2114     }
2115 
2116     size = lseek(s->fd, 0, SEEK_END);
2117     if (size < 0) {
2118         return -errno;
2119     }
2120     return size;
2121 }
2122 #endif
2123 
2124 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2125 {
2126     struct stat st;
2127     BDRVRawState *s = bs->opaque;
2128 
2129     if (fstat(s->fd, &st) < 0) {
2130         return -errno;
2131     }
2132     return (int64_t)st.st_blocks * 512;
2133 }
2134 
2135 static int coroutine_fn
2136 raw_co_create(BlockdevCreateOptions *options, Error **errp)
2137 {
2138     BlockdevCreateOptionsFile *file_opts;
2139     Error *local_err = NULL;
2140     int fd;
2141     uint64_t perm, shared;
2142     int result = 0;
2143 
2144     /* Validate options and set default values */
2145     assert(options->driver == BLOCKDEV_DRIVER_FILE);
2146     file_opts = &options->u.file;
2147 
2148     if (!file_opts->has_nocow) {
2149         file_opts->nocow = false;
2150     }
2151     if (!file_opts->has_preallocation) {
2152         file_opts->preallocation = PREALLOC_MODE_OFF;
2153     }
2154 
2155     /* Create file */
2156     fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
2157     if (fd < 0) {
2158         result = -errno;
2159         error_setg_errno(errp, -result, "Could not create file");
2160         goto out;
2161     }
2162 
2163     /* Take permissions: We want to discard everything, so we need
2164      * BLK_PERM_WRITE; and truncation to the desired size requires
2165      * BLK_PERM_RESIZE.
2166      * On the other hand, we cannot share the RESIZE permission
2167      * because we promise that after this function, the file has the
2168      * size given in the options.  If someone else were to resize it
2169      * concurrently, we could not guarantee that.
2170      * Note that after this function, we can no longer guarantee that
2171      * the file is not touched by a third party, so it may be resized
2172      * then. */
2173     perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2174     shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2175 
2176     /* Step one: Take locks */
2177     result = raw_apply_lock_bytes(fd, perm, ~shared, false, errp);
2178     if (result < 0) {
2179         goto out_close;
2180     }
2181 
2182     /* Step two: Check that nobody else has taken conflicting locks */
2183     result = raw_check_lock_bytes(fd, perm, shared, errp);
2184     if (result < 0) {
2185         goto out_unlock;
2186     }
2187 
2188     /* Clear the file by truncating it to 0 */
2189     result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2190     if (result < 0) {
2191         goto out_unlock;
2192     }
2193 
2194     if (file_opts->nocow) {
2195 #ifdef __linux__
2196         /* Set NOCOW flag to solve performance issue on fs like btrfs.
2197          * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
2198          * will be ignored since any failure of this operation should not
2199          * block the left work.
2200          */
2201         int attr;
2202         if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2203             attr |= FS_NOCOW_FL;
2204             ioctl(fd, FS_IOC_SETFLAGS, &attr);
2205         }
2206 #endif
2207     }
2208 
2209     /* Resize and potentially preallocate the file to the desired
2210      * final size */
2211     result = raw_regular_truncate(NULL, fd, file_opts->size,
2212                                   file_opts->preallocation, errp);
2213     if (result < 0) {
2214         goto out_unlock;
2215     }
2216 
2217 out_unlock:
2218     raw_apply_lock_bytes(fd, 0, 0, true, &local_err);
2219     if (local_err) {
2220         /* The above call should not fail, and if it does, that does
2221          * not mean the whole creation operation has failed.  So
2222          * report it the user for their convenience, but do not report
2223          * it to the caller. */
2224         error_report_err(local_err);
2225     }
2226 
2227 out_close:
2228     if (qemu_close(fd) != 0 && result == 0) {
2229         result = -errno;
2230         error_setg_errno(errp, -result, "Could not close the new file");
2231     }
2232 out:
2233     return result;
2234 }
2235 
2236 static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
2237                                            Error **errp)
2238 {
2239     BlockdevCreateOptions options;
2240     int64_t total_size = 0;
2241     bool nocow = false;
2242     PreallocMode prealloc;
2243     char *buf = NULL;
2244     Error *local_err = NULL;
2245 
2246     /* Skip file: protocol prefix */
2247     strstart(filename, "file:", &filename);
2248 
2249     /* Read out options */
2250     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2251                           BDRV_SECTOR_SIZE);
2252     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2253     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2254     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2255                                PREALLOC_MODE_OFF, &local_err);
2256     g_free(buf);
2257     if (local_err) {
2258         error_propagate(errp, local_err);
2259         return -EINVAL;
2260     }
2261 
2262     options = (BlockdevCreateOptions) {
2263         .driver     = BLOCKDEV_DRIVER_FILE,
2264         .u.file     = {
2265             .filename           = (char *) filename,
2266             .size               = total_size,
2267             .has_preallocation  = true,
2268             .preallocation      = prealloc,
2269             .has_nocow          = true,
2270             .nocow              = nocow,
2271         },
2272     };
2273     return raw_co_create(&options, errp);
2274 }
2275 
2276 /*
2277  * Find allocation range in @bs around offset @start.
2278  * May change underlying file descriptor's file offset.
2279  * If @start is not in a hole, store @start in @data, and the
2280  * beginning of the next hole in @hole, and return 0.
2281  * If @start is in a non-trailing hole, store @start in @hole and the
2282  * beginning of the next non-hole in @data, and return 0.
2283  * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2284  * If we can't find out, return a negative errno other than -ENXIO.
2285  */
2286 static int find_allocation(BlockDriverState *bs, off_t start,
2287                            off_t *data, off_t *hole)
2288 {
2289 #if defined SEEK_HOLE && defined SEEK_DATA
2290     BDRVRawState *s = bs->opaque;
2291     off_t offs;
2292 
2293     /*
2294      * SEEK_DATA cases:
2295      * D1. offs == start: start is in data
2296      * D2. offs > start: start is in a hole, next data at offs
2297      * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2298      *                              or start is beyond EOF
2299      *     If the latter happens, the file has been truncated behind
2300      *     our back since we opened it.  All bets are off then.
2301      *     Treating like a trailing hole is simplest.
2302      * D4. offs < 0, errno != ENXIO: we learned nothing
2303      */
2304     offs = lseek(s->fd, start, SEEK_DATA);
2305     if (offs < 0) {
2306         return -errno;          /* D3 or D4 */
2307     }
2308 
2309     if (offs < start) {
2310         /* This is not a valid return by lseek().  We are safe to just return
2311          * -EIO in this case, and we'll treat it like D4. */
2312         return -EIO;
2313     }
2314 
2315     if (offs > start) {
2316         /* D2: in hole, next data at offs */
2317         *hole = start;
2318         *data = offs;
2319         return 0;
2320     }
2321 
2322     /* D1: in data, end not yet known */
2323 
2324     /*
2325      * SEEK_HOLE cases:
2326      * H1. offs == start: start is in a hole
2327      *     If this happens here, a hole has been dug behind our back
2328      *     since the previous lseek().
2329      * H2. offs > start: either start is in data, next hole at offs,
2330      *                   or start is in trailing hole, EOF at offs
2331      *     Linux treats trailing holes like any other hole: offs ==
2332      *     start.  Solaris seeks to EOF instead: offs > start (blech).
2333      *     If that happens here, a hole has been dug behind our back
2334      *     since the previous lseek().
2335      * H3. offs < 0, errno = ENXIO: start is beyond EOF
2336      *     If this happens, the file has been truncated behind our
2337      *     back since we opened it.  Treat it like a trailing hole.
2338      * H4. offs < 0, errno != ENXIO: we learned nothing
2339      *     Pretend we know nothing at all, i.e. "forget" about D1.
2340      */
2341     offs = lseek(s->fd, start, SEEK_HOLE);
2342     if (offs < 0) {
2343         return -errno;          /* D1 and (H3 or H4) */
2344     }
2345 
2346     if (offs < start) {
2347         /* This is not a valid return by lseek().  We are safe to just return
2348          * -EIO in this case, and we'll treat it like H4. */
2349         return -EIO;
2350     }
2351 
2352     if (offs > start) {
2353         /*
2354          * D1 and H2: either in data, next hole at offs, or it was in
2355          * data but is now in a trailing hole.  In the latter case,
2356          * all bets are off.  Treating it as if it there was data all
2357          * the way to EOF is safe, so simply do that.
2358          */
2359         *data = start;
2360         *hole = offs;
2361         return 0;
2362     }
2363 
2364     /* D1 and H1 */
2365     return -EBUSY;
2366 #else
2367     return -ENOTSUP;
2368 #endif
2369 }
2370 
2371 /*
2372  * Returns the allocation status of the specified offset.
2373  *
2374  * The block layer guarantees 'offset' and 'bytes' are within bounds.
2375  *
2376  * 'pnum' is set to the number of bytes (including and immediately following
2377  * the specified offset) that are known to be in the same
2378  * allocated/unallocated state.
2379  *
2380  * 'bytes' is the max value 'pnum' should be set to.
2381  */
2382 static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2383                                             bool want_zero,
2384                                             int64_t offset,
2385                                             int64_t bytes, int64_t *pnum,
2386                                             int64_t *map,
2387                                             BlockDriverState **file)
2388 {
2389     off_t data = 0, hole = 0;
2390     int ret;
2391 
2392     ret = fd_open(bs);
2393     if (ret < 0) {
2394         return ret;
2395     }
2396 
2397     if (!want_zero) {
2398         *pnum = bytes;
2399         *map = offset;
2400         *file = bs;
2401         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2402     }
2403 
2404     ret = find_allocation(bs, offset, &data, &hole);
2405     if (ret == -ENXIO) {
2406         /* Trailing hole */
2407         *pnum = bytes;
2408         ret = BDRV_BLOCK_ZERO;
2409     } else if (ret < 0) {
2410         /* No info available, so pretend there are no holes */
2411         *pnum = bytes;
2412         ret = BDRV_BLOCK_DATA;
2413     } else if (data == offset) {
2414         /* On a data extent, compute bytes to the end of the extent,
2415          * possibly including a partial sector at EOF. */
2416         *pnum = MIN(bytes, hole - offset);
2417         ret = BDRV_BLOCK_DATA;
2418     } else {
2419         /* On a hole, compute bytes to the beginning of the next extent.  */
2420         assert(hole == offset);
2421         *pnum = MIN(bytes, data - offset);
2422         ret = BDRV_BLOCK_ZERO;
2423     }
2424     *map = offset;
2425     *file = bs;
2426     return ret | BDRV_BLOCK_OFFSET_VALID;
2427 }
2428 
2429 #if defined(__linux__)
2430 /* Verify that the file is not in the page cache */
2431 static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2432 {
2433     const size_t window_size = 128 * 1024 * 1024;
2434     BDRVRawState *s = bs->opaque;
2435     void *window = NULL;
2436     size_t length = 0;
2437     unsigned char *vec;
2438     size_t page_size;
2439     off_t offset;
2440     off_t end;
2441 
2442     /* mincore(2) page status information requires 1 byte per page */
2443     page_size = sysconf(_SC_PAGESIZE);
2444     vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2445 
2446     end = raw_getlength(bs);
2447 
2448     for (offset = 0; offset < end; offset += window_size) {
2449         void *new_window;
2450         size_t new_length;
2451         size_t vec_end;
2452         size_t i;
2453         int ret;
2454 
2455         /* Unmap previous window if size has changed */
2456         new_length = MIN(end - offset, window_size);
2457         if (new_length != length) {
2458             munmap(window, length);
2459             window = NULL;
2460             length = 0;
2461         }
2462 
2463         new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2464                           s->fd, offset);
2465         if (new_window == MAP_FAILED) {
2466             error_setg_errno(errp, errno, "mmap failed");
2467             break;
2468         }
2469 
2470         window = new_window;
2471         length = new_length;
2472 
2473         ret = mincore(window, length, vec);
2474         if (ret < 0) {
2475             error_setg_errno(errp, errno, "mincore failed");
2476             break;
2477         }
2478 
2479         vec_end = DIV_ROUND_UP(length, page_size);
2480         for (i = 0; i < vec_end; i++) {
2481             if (vec[i] & 0x1) {
2482                 error_setg(errp, "page cache still in use!");
2483                 break;
2484             }
2485         }
2486     }
2487 
2488     if (window) {
2489         munmap(window, length);
2490     }
2491 
2492     g_free(vec);
2493 }
2494 #endif /* __linux__ */
2495 
2496 static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2497                                                  Error **errp)
2498 {
2499     BDRVRawState *s = bs->opaque;
2500     int ret;
2501 
2502     ret = fd_open(bs);
2503     if (ret < 0) {
2504         error_setg_errno(errp, -ret, "The file descriptor is not open");
2505         return;
2506     }
2507 
2508     if (s->open_flags & O_DIRECT) {
2509         return; /* No host kernel page cache */
2510     }
2511 
2512 #if defined(__linux__)
2513     /* This sets the scene for the next syscall... */
2514     ret = bdrv_co_flush(bs);
2515     if (ret < 0) {
2516         error_setg_errno(errp, -ret, "flush failed");
2517         return;
2518     }
2519 
2520     /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2521      * process.  These limitations are okay because we just fsynced the file,
2522      * we don't use mmap, and the file should not be in use by other processes.
2523      */
2524     ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2525     if (ret != 0) { /* the return value is a positive errno */
2526         error_setg_errno(errp, ret, "fadvise failed");
2527         return;
2528     }
2529 
2530     if (s->check_cache_dropped) {
2531         check_cache_dropped(bs, errp);
2532     }
2533 #else /* __linux__ */
2534     /* Do nothing.  Live migration to a remote host with cache.direct=off is
2535      * unsupported on other host operating systems.  Cache consistency issues
2536      * may occur but no error is reported here, partly because that's the
2537      * historical behavior and partly because it's hard to differentiate valid
2538      * configurations that should not cause errors.
2539      */
2540 #endif /* !__linux__ */
2541 }
2542 
2543 static coroutine_fn int
2544 raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2545 {
2546     BDRVRawState *s = bs->opaque;
2547 
2548     return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_DISCARD);
2549 }
2550 
2551 static int coroutine_fn raw_co_pwrite_zeroes(
2552     BlockDriverState *bs, int64_t offset,
2553     int bytes, BdrvRequestFlags flags)
2554 {
2555     BDRVRawState *s = bs->opaque;
2556 
2557     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
2558         return paio_submit_co(bs, s->fd, offset, NULL, bytes,
2559                               QEMU_AIO_WRITE_ZEROES);
2560     } else if (s->discard_zeroes) {
2561         return paio_submit_co(bs, s->fd, offset, NULL, bytes,
2562                               QEMU_AIO_DISCARD);
2563     }
2564     return -ENOTSUP;
2565 }
2566 
2567 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2568 {
2569     BDRVRawState *s = bs->opaque;
2570 
2571     bdi->unallocated_blocks_are_zero = s->discard_zeroes;
2572     return 0;
2573 }
2574 
2575 static QemuOptsList raw_create_opts = {
2576     .name = "raw-create-opts",
2577     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
2578     .desc = {
2579         {
2580             .name = BLOCK_OPT_SIZE,
2581             .type = QEMU_OPT_SIZE,
2582             .help = "Virtual disk size"
2583         },
2584         {
2585             .name = BLOCK_OPT_NOCOW,
2586             .type = QEMU_OPT_BOOL,
2587             .help = "Turn off copy-on-write (valid only on btrfs)"
2588         },
2589         {
2590             .name = BLOCK_OPT_PREALLOC,
2591             .type = QEMU_OPT_STRING,
2592             .help = "Preallocation mode (allowed values: off, falloc, full)"
2593         },
2594         { /* end of list */ }
2595     }
2596 };
2597 
2598 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
2599                           Error **errp)
2600 {
2601     return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
2602 }
2603 
2604 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
2605 {
2606     BDRVRawState *s = bs->opaque;
2607     raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
2608     s->perm = perm;
2609     s->shared_perm = shared;
2610 }
2611 
2612 static void raw_abort_perm_update(BlockDriverState *bs)
2613 {
2614     raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
2615 }
2616 
2617 static int coroutine_fn raw_co_copy_range_from(
2618         BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
2619         BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
2620         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
2621 {
2622     return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
2623                                  read_flags, write_flags);
2624 }
2625 
2626 static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
2627                                              BdrvChild *src,
2628                                              uint64_t src_offset,
2629                                              BdrvChild *dst,
2630                                              uint64_t dst_offset,
2631                                              uint64_t bytes,
2632                                              BdrvRequestFlags read_flags,
2633                                              BdrvRequestFlags write_flags)
2634 {
2635     BDRVRawState *s = bs->opaque;
2636     BDRVRawState *src_s;
2637 
2638     assert(dst->bs == bs);
2639     if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
2640         return -ENOTSUP;
2641     }
2642 
2643     src_s = src->bs->opaque;
2644     if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
2645         return -EIO;
2646     }
2647     return paio_submit_co_full(bs, src_s->fd, src_offset, s->fd, dst_offset,
2648                                NULL, bytes, QEMU_AIO_COPY_RANGE);
2649 }
2650 
2651 BlockDriver bdrv_file = {
2652     .format_name = "file",
2653     .protocol_name = "file",
2654     .instance_size = sizeof(BDRVRawState),
2655     .bdrv_needs_filename = true,
2656     .bdrv_probe = NULL, /* no probe for protocols */
2657     .bdrv_parse_filename = raw_parse_filename,
2658     .bdrv_file_open = raw_open,
2659     .bdrv_reopen_prepare = raw_reopen_prepare,
2660     .bdrv_reopen_commit = raw_reopen_commit,
2661     .bdrv_reopen_abort = raw_reopen_abort,
2662     .bdrv_close = raw_close,
2663     .bdrv_co_create = raw_co_create,
2664     .bdrv_co_create_opts = raw_co_create_opts,
2665     .bdrv_has_zero_init = bdrv_has_zero_init_1,
2666     .bdrv_co_block_status = raw_co_block_status,
2667     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
2668     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
2669 
2670     .bdrv_co_preadv         = raw_co_preadv,
2671     .bdrv_co_pwritev        = raw_co_pwritev,
2672     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
2673     .bdrv_co_pdiscard       = raw_co_pdiscard,
2674     .bdrv_co_copy_range_from = raw_co_copy_range_from,
2675     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
2676     .bdrv_refresh_limits = raw_refresh_limits,
2677     .bdrv_io_plug = raw_aio_plug,
2678     .bdrv_io_unplug = raw_aio_unplug,
2679     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
2680 
2681     .bdrv_co_truncate = raw_co_truncate,
2682     .bdrv_getlength = raw_getlength,
2683     .bdrv_get_info = raw_get_info,
2684     .bdrv_get_allocated_file_size
2685                         = raw_get_allocated_file_size,
2686     .bdrv_check_perm = raw_check_perm,
2687     .bdrv_set_perm   = raw_set_perm,
2688     .bdrv_abort_perm_update = raw_abort_perm_update,
2689     .create_opts = &raw_create_opts,
2690 };
2691 
2692 /***********************************************/
2693 /* host device */
2694 
2695 #if defined(__APPLE__) && defined(__MACH__)
2696 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2697                                 CFIndex maxPathSize, int flags);
2698 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
2699 {
2700     kern_return_t kernResult = KERN_FAILURE;
2701     mach_port_t     masterPort;
2702     CFMutableDictionaryRef  classesToMatch;
2703     const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
2704     char *mediaType = NULL;
2705 
2706     kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
2707     if ( KERN_SUCCESS != kernResult ) {
2708         printf( "IOMasterPort returned %d\n", kernResult );
2709     }
2710 
2711     int index;
2712     for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
2713         classesToMatch = IOServiceMatching(matching_array[index]);
2714         if (classesToMatch == NULL) {
2715             error_report("IOServiceMatching returned NULL for %s",
2716                          matching_array[index]);
2717             continue;
2718         }
2719         CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
2720                              kCFBooleanTrue);
2721         kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
2722                                                   mediaIterator);
2723         if (kernResult != KERN_SUCCESS) {
2724             error_report("Note: IOServiceGetMatchingServices returned %d",
2725                          kernResult);
2726             continue;
2727         }
2728 
2729         /* If a match was found, leave the loop */
2730         if (*mediaIterator != 0) {
2731             DPRINTF("Matching using %s\n", matching_array[index]);
2732             mediaType = g_strdup(matching_array[index]);
2733             break;
2734         }
2735     }
2736     return mediaType;
2737 }
2738 
2739 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2740                          CFIndex maxPathSize, int flags)
2741 {
2742     io_object_t     nextMedia;
2743     kern_return_t   kernResult = KERN_FAILURE;
2744     *bsdPath = '\0';
2745     nextMedia = IOIteratorNext( mediaIterator );
2746     if ( nextMedia )
2747     {
2748         CFTypeRef   bsdPathAsCFString;
2749     bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
2750         if ( bsdPathAsCFString ) {
2751             size_t devPathLength;
2752             strcpy( bsdPath, _PATH_DEV );
2753             if (flags & BDRV_O_NOCACHE) {
2754                 strcat(bsdPath, "r");
2755             }
2756             devPathLength = strlen( bsdPath );
2757             if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
2758                 kernResult = KERN_SUCCESS;
2759             }
2760             CFRelease( bsdPathAsCFString );
2761         }
2762         IOObjectRelease( nextMedia );
2763     }
2764 
2765     return kernResult;
2766 }
2767 
2768 /* Sets up a real cdrom for use in QEMU */
2769 static bool setup_cdrom(char *bsd_path, Error **errp)
2770 {
2771     int index, num_of_test_partitions = 2, fd;
2772     char test_partition[MAXPATHLEN];
2773     bool partition_found = false;
2774 
2775     /* look for a working partition */
2776     for (index = 0; index < num_of_test_partitions; index++) {
2777         snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
2778                  index);
2779         fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
2780         if (fd >= 0) {
2781             partition_found = true;
2782             qemu_close(fd);
2783             break;
2784         }
2785     }
2786 
2787     /* if a working partition on the device was not found */
2788     if (partition_found == false) {
2789         error_setg(errp, "Failed to find a working partition on disc");
2790     } else {
2791         DPRINTF("Using %s as optical disc\n", test_partition);
2792         pstrcpy(bsd_path, MAXPATHLEN, test_partition);
2793     }
2794     return partition_found;
2795 }
2796 
/*
 * Prints directions on mounting and unmounting a device: a hint that
 * @file_name may need unmounting, followed by the diskutil commands to
 * unmount and to mount it again.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, "
                 "unmount it first before using it in QEMU",
                 file_name);

    error_report("Command to unmount device: "
                 "diskutil unmountDisk %s",
                 file_name);

    error_report("Command to mount device: "
                 "diskutil mountDisk %s",
                 file_name);
}
2806 
2807 #endif /* defined(__APPLE__) && defined(__MACH__) */
2808 
/*
 * Score how well this driver matches @filename.
 *
 * Returns 50 for names starting with "/dev/cdrom", 100 for an existing
 * character or block device node, and 0 otherwise.
 */
static int hdev_probe_device(const char *filename)
{
    struct stat st;
    bool is_device;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    is_device = stat(filename, &st) >= 0 &&
                (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode));

    return is_device ? 100 : 0;
}
2824 
2825 static int check_hdev_writable(BDRVRawState *s)
2826 {
2827 #if defined(BLKROGET)
2828     /* Linux block devices can be configured "read-only" using blockdev(8).
2829      * This is independent of device node permissions and therefore open(2)
2830      * with O_RDWR succeeds.  Actual writes fail with EPERM.
2831      *
2832      * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
2833      * check for read-only block devices so that Linux block devices behave
2834      * properly.
2835      */
2836     struct stat st;
2837     int readonly = 0;
2838 
2839     if (fstat(s->fd, &st)) {
2840         return -errno;
2841     }
2842 
2843     if (!S_ISBLK(st.st_mode)) {
2844         return 0;
2845     }
2846 
2847     if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
2848         return -errno;
2849     }
2850 
2851     if (readonly) {
2852         return -EACCES;
2853     }
2854 #endif /* defined(BLKROGET) */
2855     return 0;
2856 }
2857 
2858 static void hdev_parse_filename(const char *filename, QDict *options,
2859                                 Error **errp)
2860 {
2861     bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
2862 }
2863 
2864 static bool hdev_is_sg(BlockDriverState *bs)
2865 {
2866 
2867 #if defined(__linux__)
2868 
2869     BDRVRawState *s = bs->opaque;
2870     struct stat st;
2871     struct sg_scsi_id scsiid;
2872     int sg_version;
2873     int ret;
2874 
2875     if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
2876         return false;
2877     }
2878 
2879     ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
2880     if (ret < 0) {
2881         return false;
2882     }
2883 
2884     ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
2885     if (ret >= 0) {
2886         DPRINTF("SG device found: type=%d, version=%d\n",
2887             scsiid.scsi_type, sg_version);
2888         return true;
2889     }
2890 
2891 #endif
2892 
2893     return false;
2894 }
2895 
2896 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
2897                      Error **errp)
2898 {
2899     BDRVRawState *s = bs->opaque;
2900     Error *local_err = NULL;
2901     int ret;
2902 
2903 #if defined(__APPLE__) && defined(__MACH__)
2904     /*
2905      * Caution: while qdict_get_str() is fine, getting non-string types
2906      * would require more care.  When @options come from -blockdev or
2907      * blockdev_add, its members are typed according to the QAPI
2908      * schema, but when they come from -drive, they're all QString.
2909      */
2910     const char *filename = qdict_get_str(options, "filename");
2911     char bsd_path[MAXPATHLEN] = "";
2912     bool error_occurred = false;
2913 
2914     /* If using a real cdrom */
2915     if (strcmp(filename, "/dev/cdrom") == 0) {
2916         char *mediaType = NULL;
2917         kern_return_t ret_val;
2918         io_iterator_t mediaIterator = 0;
2919 
2920         mediaType = FindEjectableOpticalMedia(&mediaIterator);
2921         if (mediaType == NULL) {
2922             error_setg(errp, "Please make sure your CD/DVD is in the optical"
2923                        " drive");
2924             error_occurred = true;
2925             goto hdev_open_Mac_error;
2926         }
2927 
2928         ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
2929         if (ret_val != KERN_SUCCESS) {
2930             error_setg(errp, "Could not get BSD path for optical drive");
2931             error_occurred = true;
2932             goto hdev_open_Mac_error;
2933         }
2934 
2935         /* If a real optical drive was not found */
2936         if (bsd_path[0] == '\0') {
2937             error_setg(errp, "Failed to obtain bsd path for optical drive");
2938             error_occurred = true;
2939             goto hdev_open_Mac_error;
2940         }
2941 
2942         /* If using a cdrom disc and finding a partition on the disc failed */
2943         if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
2944             setup_cdrom(bsd_path, errp) == false) {
2945             print_unmounting_directions(bsd_path);
2946             error_occurred = true;
2947             goto hdev_open_Mac_error;
2948         }
2949 
2950         qdict_put_str(options, "filename", bsd_path);
2951 
2952 hdev_open_Mac_error:
2953         g_free(mediaType);
2954         if (mediaIterator) {
2955             IOObjectRelease(mediaIterator);
2956         }
2957         if (error_occurred) {
2958             return -ENOENT;
2959         }
2960     }
2961 #endif /* defined(__APPLE__) && defined(__MACH__) */
2962 
2963     s->type = FTYPE_FILE;
2964 
2965     ret = raw_open_common(bs, options, flags, 0, true, &local_err);
2966     if (ret < 0) {
2967         error_propagate(errp, local_err);
2968 #if defined(__APPLE__) && defined(__MACH__)
2969         if (*bsd_path) {
2970             filename = bsd_path;
2971         }
2972         /* if a physical device experienced an error while being opened */
2973         if (strncmp(filename, "/dev/", 5) == 0) {
2974             print_unmounting_directions(filename);
2975         }
2976 #endif /* defined(__APPLE__) && defined(__MACH__) */
2977         return ret;
2978     }
2979 
2980     /* Since this does ioctl the device must be already opened */
2981     bs->sg = hdev_is_sg(bs);
2982 
2983     if (flags & BDRV_O_RDWR) {
2984         ret = check_hdev_writable(s);
2985         if (ret < 0) {
2986             raw_close(bs);
2987             error_setg_errno(errp, -ret, "The device is not writable");
2988             return ret;
2989         }
2990     }
2991 
2992     return ret;
2993 }
2994 
2995 #if defined(__linux__)
2996 
2997 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
2998         unsigned long int req, void *buf,
2999         BlockCompletionFunc *cb, void *opaque)
3000 {
3001     BDRVRawState *s = bs->opaque;
3002     RawPosixAIOData *acb;
3003     ThreadPool *pool;
3004 
3005     if (fd_open(bs) < 0)
3006         return NULL;
3007 
3008     if (req == SG_IO && s->pr_mgr) {
3009         struct sg_io_hdr *io_hdr = buf;
3010         if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3011             io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3012             return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3013                                       s->fd, io_hdr, cb, opaque);
3014         }
3015     }
3016 
3017     acb = g_new(RawPosixAIOData, 1);
3018     acb->bs = bs;
3019     acb->aio_type = QEMU_AIO_IOCTL;
3020     acb->aio_fildes = s->fd;
3021     acb->aio_offset = 0;
3022     acb->aio_ioctl_buf = buf;
3023     acb->aio_ioctl_cmd = req;
3024     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
3025     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
3026 }
3027 #endif /* linux */
3028 
3029 static int fd_open(BlockDriverState *bs)
3030 {
3031     BDRVRawState *s = bs->opaque;
3032 
3033     /* this is just to ensure s->fd is sane (its called by io ops) */
3034     if (s->fd >= 0)
3035         return 0;
3036     return -EIO;
3037 }
3038 
3039 static coroutine_fn int
3040 hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
3041 {
3042     BDRVRawState *s = bs->opaque;
3043     int ret;
3044 
3045     ret = fd_open(bs);
3046     if (ret < 0) {
3047         return ret;
3048     }
3049     return paio_submit_co(bs, s->fd, offset, NULL, bytes,
3050                           QEMU_AIO_DISCARD | QEMU_AIO_BLKDEV);
3051 }
3052 
3053 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3054     int64_t offset, int bytes, BdrvRequestFlags flags)
3055 {
3056     BDRVRawState *s = bs->opaque;
3057     int rc;
3058 
3059     rc = fd_open(bs);
3060     if (rc < 0) {
3061         return rc;
3062     }
3063     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
3064         return paio_submit_co(bs, s->fd, offset, NULL, bytes,
3065                               QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
3066     } else if (s->discard_zeroes) {
3067         return paio_submit_co(bs, s->fd, offset, NULL, bytes,
3068                               QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
3069     }
3070     return -ENOTSUP;
3071 }
3072 
3073 static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts,
3074                                             Error **errp)
3075 {
3076     int fd;
3077     int ret = 0;
3078     struct stat stat_buf;
3079     int64_t total_size = 0;
3080     bool has_prefix;
3081 
3082     /* This function is used by both protocol block drivers and therefore either
3083      * of these prefixes may be given.
3084      * The return value has to be stored somewhere, otherwise this is an error
3085      * due to -Werror=unused-value. */
3086     has_prefix =
3087         strstart(filename, "host_device:", &filename) ||
3088         strstart(filename, "host_cdrom:" , &filename);
3089 
3090     (void)has_prefix;
3091 
3092     ret = raw_normalize_devicepath(&filename);
3093     if (ret < 0) {
3094         error_setg_errno(errp, -ret, "Could not normalize device path");
3095         return ret;
3096     }
3097 
3098     /* Read out options */
3099     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
3100                           BDRV_SECTOR_SIZE);
3101 
3102     fd = qemu_open(filename, O_WRONLY | O_BINARY);
3103     if (fd < 0) {
3104         ret = -errno;
3105         error_setg_errno(errp, -ret, "Could not open device");
3106         return ret;
3107     }
3108 
3109     if (fstat(fd, &stat_buf) < 0) {
3110         ret = -errno;
3111         error_setg_errno(errp, -ret, "Could not stat device");
3112     } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
3113         error_setg(errp,
3114                    "The given file is neither a block nor a character device");
3115         ret = -ENODEV;
3116     } else if (lseek(fd, 0, SEEK_END) < total_size) {
3117         error_setg(errp, "Device is too small");
3118         ret = -ENOSPC;
3119     }
3120 
3121     if (!ret && total_size) {
3122         uint8_t buf[BDRV_SECTOR_SIZE] = { 0 };
3123         int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size);
3124         if (lseek(fd, 0, SEEK_SET) == -1) {
3125             ret = -errno;
3126         } else {
3127             ret = qemu_write_full(fd, buf, zero_size);
3128             ret = ret == zero_size ? 0 : -errno;
3129         }
3130     }
3131     qemu_close(fd);
3132     return ret;
3133 }
3134 
3135 static BlockDriver bdrv_host_device = {
3136     .format_name        = "host_device",
3137     .protocol_name        = "host_device",
3138     .instance_size      = sizeof(BDRVRawState),
3139     .bdrv_needs_filename = true,
3140     .bdrv_probe_device  = hdev_probe_device,
3141     .bdrv_parse_filename = hdev_parse_filename,
3142     .bdrv_file_open     = hdev_open,
3143     .bdrv_close         = raw_close,
3144     .bdrv_reopen_prepare = raw_reopen_prepare,
3145     .bdrv_reopen_commit  = raw_reopen_commit,
3146     .bdrv_reopen_abort   = raw_reopen_abort,
3147     .bdrv_co_create_opts = hdev_co_create_opts,
3148     .create_opts         = &raw_create_opts,
3149     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3150     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3151 
3152     .bdrv_co_preadv         = raw_co_preadv,
3153     .bdrv_co_pwritev        = raw_co_pwritev,
3154     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3155     .bdrv_co_pdiscard       = hdev_co_pdiscard,
3156     .bdrv_co_copy_range_from = raw_co_copy_range_from,
3157     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3158     .bdrv_refresh_limits = raw_refresh_limits,
3159     .bdrv_io_plug = raw_aio_plug,
3160     .bdrv_io_unplug = raw_aio_unplug,
3161 
3162     .bdrv_co_truncate       = raw_co_truncate,
3163     .bdrv_getlength	= raw_getlength,
3164     .bdrv_get_info = raw_get_info,
3165     .bdrv_get_allocated_file_size
3166                         = raw_get_allocated_file_size,
3167     .bdrv_check_perm = raw_check_perm,
3168     .bdrv_set_perm   = raw_set_perm,
3169     .bdrv_abort_perm_update = raw_abort_perm_update,
3170     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3171     .bdrv_probe_geometry = hdev_probe_geometry,
3172 
3173     /* generic scsi device */
3174 #ifdef __linux__
3175     .bdrv_aio_ioctl     = hdev_aio_ioctl,
3176 #endif
3177 };
3178 
3179 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3180 static void cdrom_parse_filename(const char *filename, QDict *options,
3181                                  Error **errp)
3182 {
3183     bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3184 }
3185 #endif
3186 
3187 #ifdef __linux__
3188 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3189                       Error **errp)
3190 {
3191     BDRVRawState *s = bs->opaque;
3192 
3193     s->type = FTYPE_CD;
3194 
3195     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3196     return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3197 }
3198 
3199 static int cdrom_probe_device(const char *filename)
3200 {
3201     int fd, ret;
3202     int prio = 0;
3203     struct stat st;
3204 
3205     fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
3206     if (fd < 0) {
3207         goto out;
3208     }
3209     ret = fstat(fd, &st);
3210     if (ret == -1 || !S_ISBLK(st.st_mode)) {
3211         goto outc;
3212     }
3213 
3214     /* Attempt to detect via a CDROM specific ioctl */
3215     ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3216     if (ret >= 0)
3217         prio = 100;
3218 
3219 outc:
3220     qemu_close(fd);
3221 out:
3222     return prio;
3223 }
3224 
3225 static bool cdrom_is_inserted(BlockDriverState *bs)
3226 {
3227     BDRVRawState *s = bs->opaque;
3228     int ret;
3229 
3230     ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3231     return ret == CDS_DISC_OK;
3232 }
3233 
3234 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3235 {
3236     BDRVRawState *s = bs->opaque;
3237 
3238     if (eject_flag) {
3239         if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3240             perror("CDROMEJECT");
3241     } else {
3242         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3243             perror("CDROMEJECT");
3244     }
3245 }
3246 
3247 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3248 {
3249     BDRVRawState *s = bs->opaque;
3250 
3251     if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3252         /*
3253          * Note: an error can happen if the distribution automatically
3254          * mounts the CD-ROM
3255          */
3256         /* perror("CDROM_LOCKDOOR"); */
3257     }
3258 }
3259 
3260 static BlockDriver bdrv_host_cdrom = {
3261     .format_name        = "host_cdrom",
3262     .protocol_name      = "host_cdrom",
3263     .instance_size      = sizeof(BDRVRawState),
3264     .bdrv_needs_filename = true,
3265     .bdrv_probe_device	= cdrom_probe_device,
3266     .bdrv_parse_filename = cdrom_parse_filename,
3267     .bdrv_file_open     = cdrom_open,
3268     .bdrv_close         = raw_close,
3269     .bdrv_reopen_prepare = raw_reopen_prepare,
3270     .bdrv_reopen_commit  = raw_reopen_commit,
3271     .bdrv_reopen_abort   = raw_reopen_abort,
3272     .bdrv_co_create_opts = hdev_co_create_opts,
3273     .create_opts         = &raw_create_opts,
3274     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3275 
3276 
3277     .bdrv_co_preadv         = raw_co_preadv,
3278     .bdrv_co_pwritev        = raw_co_pwritev,
3279     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3280     .bdrv_refresh_limits = raw_refresh_limits,
3281     .bdrv_io_plug = raw_aio_plug,
3282     .bdrv_io_unplug = raw_aio_unplug,
3283 
3284     .bdrv_co_truncate    = raw_co_truncate,
3285     .bdrv_getlength      = raw_getlength,
3286     .has_variable_length = true,
3287     .bdrv_get_allocated_file_size
3288                         = raw_get_allocated_file_size,
3289 
3290     /* removable device support */
3291     .bdrv_is_inserted   = cdrom_is_inserted,
3292     .bdrv_eject         = cdrom_eject,
3293     .bdrv_lock_medium   = cdrom_lock_medium,
3294 
3295     /* generic scsi device */
3296     .bdrv_aio_ioctl     = hdev_aio_ioctl,
3297 };
3298 #endif /* __linux__ */
3299 
3300 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3301 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3302                       Error **errp)
3303 {
3304     BDRVRawState *s = bs->opaque;
3305     Error *local_err = NULL;
3306     int ret;
3307 
3308     s->type = FTYPE_CD;
3309 
3310     ret = raw_open_common(bs, options, flags, 0, true, &local_err);
3311     if (ret) {
3312         error_propagate(errp, local_err);
3313         return ret;
3314     }
3315 
3316     /* make sure the door isn't locked at this time */
3317     ioctl(s->fd, CDIOCALLOW);
3318     return 0;
3319 }
3320 
/*
 * Probe callback of the FreeBSD host_cdrom driver: returns 100 if
 * @filename starts with "/dev/cd" or "/dev/acd", and 0 otherwise.  The
 * device itself is not opened.
 */
static int cdrom_probe_device(const char *filename)
{
    static const char *const prefixes[] = { "/dev/cd", "/dev/acd" };
    size_t i;

    for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) {
        if (strstart(filename, prefixes[i], NULL)) {
            return 100;
        }
    }

    return 0;
}
3328 
3329 static int cdrom_reopen(BlockDriverState *bs)
3330 {
3331     BDRVRawState *s = bs->opaque;
3332     int fd;
3333 
3334     /*
3335      * Force reread of possibly changed/newly loaded disc,
3336      * FreeBSD seems to not notice sometimes...
3337      */
3338     if (s->fd >= 0)
3339         qemu_close(s->fd);
3340     fd = qemu_open(bs->filename, s->open_flags, 0644);
3341     if (fd < 0) {
3342         s->fd = -1;
3343         return -EIO;
3344     }
3345     s->fd = fd;
3346 
3347     /* make sure the door isn't locked at this time */
3348     ioctl(s->fd, CDIOCALLOW);
3349     return 0;
3350 }
3351 
3352 static bool cdrom_is_inserted(BlockDriverState *bs)
3353 {
3354     return raw_getlength(bs) > 0;
3355 }
3356 
3357 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3358 {
3359     BDRVRawState *s = bs->opaque;
3360 
3361     if (s->fd < 0)
3362         return;
3363 
3364     (void) ioctl(s->fd, CDIOCALLOW);
3365 
3366     if (eject_flag) {
3367         if (ioctl(s->fd, CDIOCEJECT) < 0)
3368             perror("CDIOCEJECT");
3369     } else {
3370         if (ioctl(s->fd, CDIOCCLOSE) < 0)
3371             perror("CDIOCCLOSE");
3372     }
3373 
3374     cdrom_reopen(bs);
3375 }
3376 
3377 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3378 {
3379     BDRVRawState *s = bs->opaque;
3380 
3381     if (s->fd < 0)
3382         return;
3383     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3384         /*
3385          * Note: an error can happen if the distribution automatically
3386          * mounts the CD-ROM
3387          */
3388         /* perror("CDROM_LOCKDOOR"); */
3389     }
3390 }
3391 
3392 static BlockDriver bdrv_host_cdrom = {
3393     .format_name        = "host_cdrom",
3394     .protocol_name      = "host_cdrom",
3395     .instance_size      = sizeof(BDRVRawState),
3396     .bdrv_needs_filename = true,
3397     .bdrv_probe_device	= cdrom_probe_device,
3398     .bdrv_parse_filename = cdrom_parse_filename,
3399     .bdrv_file_open     = cdrom_open,
3400     .bdrv_close         = raw_close,
3401     .bdrv_reopen_prepare = raw_reopen_prepare,
3402     .bdrv_reopen_commit  = raw_reopen_commit,
3403     .bdrv_reopen_abort   = raw_reopen_abort,
3404     .bdrv_co_create_opts = hdev_co_create_opts,
3405     .create_opts        = &raw_create_opts,
3406 
3407     .bdrv_co_preadv         = raw_co_preadv,
3408     .bdrv_co_pwritev        = raw_co_pwritev,
3409     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3410     .bdrv_refresh_limits = raw_refresh_limits,
3411     .bdrv_io_plug = raw_aio_plug,
3412     .bdrv_io_unplug = raw_aio_unplug,
3413 
3414     .bdrv_co_truncate    = raw_co_truncate,
3415     .bdrv_getlength      = raw_getlength,
3416     .has_variable_length = true,
3417     .bdrv_get_allocated_file_size
3418                         = raw_get_allocated_file_size,
3419 
3420     /* removable device support */
3421     .bdrv_is_inserted   = cdrom_is_inserted,
3422     .bdrv_eject         = cdrom_eject,
3423     .bdrv_lock_medium   = cdrom_lock_medium,
3424 };
3425 #endif /* __FreeBSD__ */
3426 
3427 static void bdrv_file_init(void)
3428 {
3429     /*
3430      * Register all the drivers.  Note that order is important, the driver
3431      * registered last will get probed first.
3432      */
3433     bdrv_register(&bdrv_file);
3434     bdrv_register(&bdrv_host_device);
3435 #ifdef __linux__
3436     bdrv_register(&bdrv_host_cdrom);
3437 #endif
3438 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3439     bdrv_register(&bdrv_host_cdrom);
3440 #endif
3441 }
3442 
3443 block_init(bdrv_file_init);
3444