xref: /openbmc/qemu/block/file-posix.c (revision 3f53bc61)
1 /*
2  * Block driver for RAW files (posix)
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "qemu/osdep.h"
25 #include "qapi/error.h"
26 #include "qemu/cutils.h"
27 #include "qemu/error-report.h"
28 #include "qemu/timer.h"
29 #include "qemu/log.h"
30 #include "block/block_int.h"
31 #include "qemu/module.h"
32 #include "trace.h"
33 #include "block/thread-pool.h"
34 #include "qemu/iov.h"
35 #include "block/raw-aio.h"
36 #include "qapi/util.h"
37 #include "qapi/qmp/qstring.h"
38 
39 #if defined(__APPLE__) && (__MACH__)
40 #include <paths.h>
41 #include <sys/param.h>
42 #include <IOKit/IOKitLib.h>
43 #include <IOKit/IOBSD.h>
44 #include <IOKit/storage/IOMediaBSDClient.h>
45 #include <IOKit/storage/IOMedia.h>
46 #include <IOKit/storage/IOCDMedia.h>
47 //#include <IOKit/storage/IOCDTypes.h>
48 #include <IOKit/storage/IODVDMedia.h>
49 #include <CoreFoundation/CoreFoundation.h>
50 #endif
51 
52 #ifdef __sun__
53 #define _POSIX_PTHREAD_SEMANTICS 1
54 #include <sys/dkio.h>
55 #endif
56 #ifdef __linux__
57 #include <sys/ioctl.h>
58 #include <sys/param.h>
59 #include <linux/cdrom.h>
60 #include <linux/fd.h>
61 #include <linux/fs.h>
62 #include <linux/hdreg.h>
63 #include <scsi/sg.h>
64 #ifdef __s390__
65 #include <asm/dasd.h>
66 #endif
67 #ifndef FS_NOCOW_FL
68 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
69 #endif
70 #endif
71 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
72 #include <linux/falloc.h>
73 #endif
74 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
75 #include <sys/disk.h>
76 #include <sys/cdio.h>
77 #endif
78 
79 #ifdef __OpenBSD__
80 #include <sys/ioctl.h>
81 #include <sys/disklabel.h>
82 #include <sys/dkio.h>
83 #endif
84 
85 #ifdef __NetBSD__
86 #include <sys/ioctl.h>
87 #include <sys/disklabel.h>
88 #include <sys/dkio.h>
89 #include <sys/disk.h>
90 #endif
91 
92 #ifdef __DragonFly__
93 #include <sys/ioctl.h>
94 #include <sys/diskslice.h>
95 #endif
96 
97 #ifdef CONFIG_XFS
98 #include <xfs/xfs.h>
99 #endif
100 
/* Uncomment the next line to make DPRINTF() print to stdout */
//#define DEBUG_BLOCK

#ifndef DEBUG_BLOCK
# define DEBUG_BLOCK_PRINT 0
#else
# define DEBUG_BLOCK_PRINT 1
#endif

/*
 * Debug printf.  The printf() call is always compiled, so its arguments stay
 * checked by the compiler, but it only runs when DEBUG_BLOCK_PRINT is
 * non-zero.
 */
#define DPRINTF(fmt, ...) \
do { \
    if (DEBUG_BLOCK_PRINT) { \
        printf(fmt, ## __VA_ARGS__); \
    } \
} while (0)

/* OS X does not have O_DSYNC: substitute O_SYNC, or failing that O_FSYNC */
#if !defined(O_DSYNC)
# if defined(O_SYNC)
#  define O_DSYNC O_SYNC
# elif defined(O_FSYNC)
#  define O_DSYNC O_FSYNC
# endif
#endif

/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
#if !defined(O_DIRECT)
# define O_DIRECT O_DSYNC
#endif

/* Values stored in BDRVRawState.type */
#define FTYPE_FILE   0
#define FTYPE_CD     1

/* Lower bound for the largest alignment tried by raw_probe_alignment() */
#define MAX_BLOCKSIZE 4096
133 
/*
 * Driver-private state hanging off BlockDriverState.opaque for images opened
 * through this driver.
 */
typedef struct BDRVRawState {
    /* Host file descriptor of the image; -1 while no file is open */
    int fd;
    /* One of the FTYPE_* constants */
    int type;
    /* Flags that were handed to qemu_open() for @fd */
    int open_flags;
    /* Memory buffer alignment chosen by raw_probe_alignment() */
    size_t buf_align;

#ifdef CONFIG_XFS
    /* Set when platform_test_xfs_fd() recognised @fd */
    bool is_xfs:1;
#endif
    /* Capability flags; cleared again once an operation proves unsupported */
    bool has_discard:1;
    bool has_write_zeroes:1;
    /* Discarded ranges can be relied on to read back as zeroes */
    bool discard_zeroes:1;
    /* aio=native was selected for this image */
    bool use_linux_aio:1;
    /* Set for regular files in raw_open_common() */
    bool has_fallocate;
    /* Requests and buffers must be aligned (O_DIRECT, FreeBSD char devices) */
    bool needs_alignment;
} BDRVRawState;
150 
/*
 * Scratch state for one reopen transaction: allocated in
 * raw_reopen_prepare(), consumed by raw_reopen_commit() or released by
 * raw_reopen_abort().
 */
typedef struct BDRVRawReopenState {
    /* Descriptor opened with the new flags, or -1 if none was obtained */
    int fd;
    /* open() flags corresponding to the requested new BDRV flags */
    int open_flags;
} BDRVRawReopenState;
155 
156 static int fd_open(BlockDriverState *bs);
157 static int64_t raw_getlength(BlockDriverState *bs);
158 
159 typedef struct RawPosixAIOData {
160     BlockDriverState *bs;
161     int aio_fildes;
162     union {
163         struct iovec *aio_iov;
164         void *aio_ioctl_buf;
165     };
166     int aio_niov;
167     uint64_t aio_nbytes;
168 #define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
169     off_t aio_offset;
170     int aio_type;
171 } RawPosixAIOData;
172 
173 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
174 static int cdrom_reopen(BlockDriverState *bs);
175 #endif
176 
/*
 * Normalize the device path in *@filename.
 *
 * On NetBSD, if *@filename names a block device, *@filename is redirected to
 * the corresponding raw device (the last path component gets an "r" prefix).
 * The new name lives in a static buffer, so it is only valid until the next
 * call.  On every other host this is a no-op.
 *
 * Returns 0 on success, -errno if the file cannot be examined.
 */
#if defined(__NetBSD__)
static int raw_normalize_devicepath(const char **filename)
{
    static char namebuf[PATH_MAX];
    const char *dp, *fname;
    struct stat sb;

    fname = *filename;
    dp = strrchr(fname, '/');
    if (lstat(fname, &sb) < 0) {
        /* fprintf() may modify errno, so capture it before reporting */
        int saved_errno = errno;

        fprintf(stderr, "%s: stat failed: %s\n",
            fname, strerror(saved_errno));
        return -saved_errno;
    }

    if (!S_ISBLK(sb.st_mode)) {
        return 0;
    }

    if (dp == NULL) {
        /* No directory part: just prefix the name */
        snprintf(namebuf, PATH_MAX, "r%s", fname);
    } else {
        /* Keep the directory, prefix only the last component */
        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
            (int)(dp - fname), fname, dp + 1);
    }
    fprintf(stderr, "%s is a block device", fname);
    *filename = namebuf;
    fprintf(stderr, ", using %s\n", *filename);

    return 0;
}
#else
static int raw_normalize_devicepath(const char **filename)
{
    return 0;
}
#endif
214 
/*
 * Query the logical block size of @fd with whichever size ioctls the host
 * provides.  Every available ioctl is tried in turn; the last one that
 * succeeds determines the value stored in @sector_size_p.
 *
 * Returns 0 if at least one ioctl succeeded, otherwise -errno (-ENOTSUP when
 * no suitable ioctl is compiled in).
 */
static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
{
    unsigned int probed;
    int hits = 0;

    /* Reported when none of the ioctls below exists on this host */
    errno = ENOTSUP;

#ifdef BLKSSZGET
    if (ioctl(fd, BLKSSZGET, &probed) >= 0) {
        *sector_size_p = probed;
        hits++;
    }
#endif
#ifdef DKIOCGETBLOCKSIZE
    if (ioctl(fd, DKIOCGETBLOCKSIZE, &probed) >= 0) {
        *sector_size_p = probed;
        hits++;
    }
#endif
#ifdef DIOCGSECTORSIZE
    if (ioctl(fd, DIOCGSECTORSIZE, &probed) >= 0) {
        *sector_size_p = probed;
        hits++;
    }
#endif

    if (hits == 0) {
        return -errno;
    }
    return 0;
}
247 
/**
 * Query the physical block size of @fd.
 * Returns 0 and fills in @blk_size on success.
 * Returns -errno if the ioctl fails, or -ENOTSUP where BLKPBSZGET does not
 * exist.
 */
static int probe_physical_blocksize(int fd, unsigned int *blk_size)
{
#ifndef BLKPBSZGET
    return -ENOTSUP;
#else
    return ioctl(fd, BLKPBSZGET, blk_size) < 0 ? -errno : 0;
#endif
}
264 
/* Probe whether a read of @len bytes at offset 0 into @buf is accepted.
 *
 * Used to discover the O_DIRECT memory buffer and request alignment: a
 * successful read means the combination is usable.
 */
static bool raw_is_io_aligned(int fd, void *buf, size_t len)
{
    if (pread(fd, buf, len, 0) < 0) {
#ifdef __linux__
        /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.
         * Any other error (e.g. real I/O error on a failed drive) says
         * nothing about alignment, so it does not count as a mismatch.
         */
        return errno != EINVAL;
#else
        return false;
#endif
    }

    return true;
}
289 
290 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
291 {
292     BDRVRawState *s = bs->opaque;
293     char *buf;
294     size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
295 
296     /* For SCSI generic devices the alignment is not really used.
297        With buffered I/O, we don't have any restrictions. */
298     if (bdrv_is_sg(bs) || !s->needs_alignment) {
299         bs->bl.request_alignment = 1;
300         s->buf_align = 1;
301         return;
302     }
303 
304     bs->bl.request_alignment = 0;
305     s->buf_align = 0;
306     /* Let's try to use the logical blocksize for the alignment. */
307     if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
308         bs->bl.request_alignment = 0;
309     }
310 #ifdef CONFIG_XFS
311     if (s->is_xfs) {
312         struct dioattr da;
313         if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
314             bs->bl.request_alignment = da.d_miniosz;
315             /* The kernel returns wrong information for d_mem */
316             /* s->buf_align = da.d_mem; */
317         }
318     }
319 #endif
320 
321     /* If we could not get the sizes so far, we can only guess them */
322     if (!s->buf_align) {
323         size_t align;
324         buf = qemu_memalign(max_align, 2 * max_align);
325         for (align = 512; align <= max_align; align <<= 1) {
326             if (raw_is_io_aligned(fd, buf + align, max_align)) {
327                 s->buf_align = align;
328                 break;
329             }
330         }
331         qemu_vfree(buf);
332     }
333 
334     if (!bs->bl.request_alignment) {
335         size_t align;
336         buf = qemu_memalign(s->buf_align, max_align);
337         for (align = 512; align <= max_align; align <<= 1) {
338             if (raw_is_io_aligned(fd, buf, align)) {
339                 bs->bl.request_alignment = align;
340                 break;
341             }
342         }
343         qemu_vfree(buf);
344     }
345 
346     if (!s->buf_align || !bs->bl.request_alignment) {
347         error_setg(errp, "Could not find working O_DIRECT alignment");
348         error_append_hint(errp, "Try cache.direct=off\n");
349     }
350 }
351 
352 static void raw_parse_flags(int bdrv_flags, int *open_flags)
353 {
354     assert(open_flags != NULL);
355 
356     *open_flags |= O_BINARY;
357     *open_flags &= ~O_ACCMODE;
358     if (bdrv_flags & BDRV_O_RDWR) {
359         *open_flags |= O_RDWR;
360     } else {
361         *open_flags |= O_RDONLY;
362     }
363 
364     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
365      * and O_DIRECT for no caching. */
366     if ((bdrv_flags & BDRV_O_NOCACHE)) {
367         *open_flags |= O_DIRECT;
368     }
369 }
370 
371 static void raw_parse_filename(const char *filename, QDict *options,
372                                Error **errp)
373 {
374     /* The filename does not have to be prefixed by the protocol name, since
375      * "file" is the default protocol; therefore, the return value of this
376      * function call can be ignored. */
377     strstart(filename, "file:", &filename);
378 
379     qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
380 }
381 
382 static QemuOptsList raw_runtime_opts = {
383     .name = "raw",
384     .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
385     .desc = {
386         {
387             .name = "filename",
388             .type = QEMU_OPT_STRING,
389             .help = "File name of the image",
390         },
391         {
392             .name = "aio",
393             .type = QEMU_OPT_STRING,
394             .help = "host AIO implementation (threads, native)",
395         },
396         { /* end of list */ }
397     },
398 };
399 
400 static int raw_open_common(BlockDriverState *bs, QDict *options,
401                            int bdrv_flags, int open_flags, Error **errp)
402 {
403     BDRVRawState *s = bs->opaque;
404     QemuOpts *opts;
405     Error *local_err = NULL;
406     const char *filename = NULL;
407     BlockdevAioOptions aio, aio_default;
408     int fd, ret;
409     struct stat st;
410 
411     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
412     qemu_opts_absorb_qdict(opts, options, &local_err);
413     if (local_err) {
414         error_propagate(errp, local_err);
415         ret = -EINVAL;
416         goto fail;
417     }
418 
419     filename = qemu_opt_get(opts, "filename");
420 
421     ret = raw_normalize_devicepath(&filename);
422     if (ret != 0) {
423         error_setg_errno(errp, -ret, "Could not normalize device path");
424         goto fail;
425     }
426 
427     aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
428                   ? BLOCKDEV_AIO_OPTIONS_NATIVE
429                   : BLOCKDEV_AIO_OPTIONS_THREADS;
430     aio = qapi_enum_parse(BlockdevAioOptions_lookup, qemu_opt_get(opts, "aio"),
431                           BLOCKDEV_AIO_OPTIONS__MAX, aio_default, &local_err);
432     if (local_err) {
433         error_propagate(errp, local_err);
434         ret = -EINVAL;
435         goto fail;
436     }
437     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
438 
439     s->open_flags = open_flags;
440     raw_parse_flags(bdrv_flags, &s->open_flags);
441 
442     s->fd = -1;
443     fd = qemu_open(filename, s->open_flags, 0644);
444     if (fd < 0) {
445         ret = -errno;
446         error_setg_errno(errp, errno, "Could not open '%s'", filename);
447         if (ret == -EROFS) {
448             ret = -EACCES;
449         }
450         goto fail;
451     }
452     s->fd = fd;
453 
454 #ifdef CONFIG_LINUX_AIO
455      /* Currently Linux does AIO only for files opened with O_DIRECT */
456     if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
457         error_setg(errp, "aio=native was specified, but it requires "
458                          "cache.direct=on, which was not specified.");
459         ret = -EINVAL;
460         goto fail;
461     }
462 #else
463     if (s->use_linux_aio) {
464         error_setg(errp, "aio=native was specified, but is not supported "
465                          "in this build.");
466         ret = -EINVAL;
467         goto fail;
468     }
469 #endif /* !defined(CONFIG_LINUX_AIO) */
470 
471     s->has_discard = true;
472     s->has_write_zeroes = true;
473     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
474     if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
475         s->needs_alignment = true;
476     }
477 
478     if (fstat(s->fd, &st) < 0) {
479         ret = -errno;
480         error_setg_errno(errp, errno, "Could not stat file");
481         goto fail;
482     }
483     if (S_ISREG(st.st_mode)) {
484         s->discard_zeroes = true;
485         s->has_fallocate = true;
486     }
487     if (S_ISBLK(st.st_mode)) {
488 #ifdef BLKDISCARDZEROES
489         unsigned int arg;
490         if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
491             s->discard_zeroes = true;
492         }
493 #endif
494 #ifdef __linux__
495         /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
496          * not rely on the contents of discarded blocks unless using O_DIRECT.
497          * Same for BLKZEROOUT.
498          */
499         if (!(bs->open_flags & BDRV_O_NOCACHE)) {
500             s->discard_zeroes = false;
501             s->has_write_zeroes = false;
502         }
503 #endif
504     }
505 #ifdef __FreeBSD__
506     if (S_ISCHR(st.st_mode)) {
507         /*
508          * The file is a char device (disk), which on FreeBSD isn't behind
509          * a pager, so force all requests to be aligned. This is needed
510          * so QEMU makes sure all IO operations on the device are aligned
511          * to sector size, or else FreeBSD will reject them with EINVAL.
512          */
513         s->needs_alignment = true;
514     }
515 #endif
516 
517 #ifdef CONFIG_XFS
518     if (platform_test_xfs_fd(s->fd)) {
519         s->is_xfs = true;
520     }
521 #endif
522 
523     ret = 0;
524 fail:
525     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
526         unlink(filename);
527     }
528     qemu_opts_del(opts);
529     return ret;
530 }
531 
532 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
533                     Error **errp)
534 {
535     BDRVRawState *s = bs->opaque;
536 
537     s->type = FTYPE_FILE;
538     return raw_open_common(bs, options, flags, 0, errp);
539 }
540 
541 static int raw_reopen_prepare(BDRVReopenState *state,
542                               BlockReopenQueue *queue, Error **errp)
543 {
544     BDRVRawState *s;
545     BDRVRawReopenState *rs;
546     int ret = 0;
547     Error *local_err = NULL;
548 
549     assert(state != NULL);
550     assert(state->bs != NULL);
551 
552     s = state->bs->opaque;
553 
554     state->opaque = g_new0(BDRVRawReopenState, 1);
555     rs = state->opaque;
556 
557     if (s->type == FTYPE_CD) {
558         rs->open_flags |= O_NONBLOCK;
559     }
560 
561     raw_parse_flags(state->flags, &rs->open_flags);
562 
563     rs->fd = -1;
564 
565     int fcntl_flags = O_APPEND | O_NONBLOCK;
566 #ifdef O_NOATIME
567     fcntl_flags |= O_NOATIME;
568 #endif
569 
570 #ifdef O_ASYNC
571     /* Not all operating systems have O_ASYNC, and those that don't
572      * will not let us track the state into rs->open_flags (typically
573      * you achieve the same effect with an ioctl, for example I_SETSIG
574      * on Solaris). But we do not use O_ASYNC, so that's fine.
575      */
576     assert((s->open_flags & O_ASYNC) == 0);
577 #endif
578 
579     if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
580         /* dup the original fd */
581         rs->fd = qemu_dup(s->fd);
582         if (rs->fd >= 0) {
583             ret = fcntl_setfl(rs->fd, rs->open_flags);
584             if (ret) {
585                 qemu_close(rs->fd);
586                 rs->fd = -1;
587             }
588         }
589     }
590 
591     /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
592     if (rs->fd == -1) {
593         const char *normalized_filename = state->bs->filename;
594         ret = raw_normalize_devicepath(&normalized_filename);
595         if (ret < 0) {
596             error_setg_errno(errp, -ret, "Could not normalize device path");
597         } else {
598             assert(!(rs->open_flags & O_CREAT));
599             rs->fd = qemu_open(normalized_filename, rs->open_flags);
600             if (rs->fd == -1) {
601                 error_setg_errno(errp, errno, "Could not reopen file");
602                 ret = -1;
603             }
604         }
605     }
606 
607     /* Fail already reopen_prepare() if we can't get a working O_DIRECT
608      * alignment with the new fd. */
609     if (rs->fd != -1) {
610         raw_probe_alignment(state->bs, rs->fd, &local_err);
611         if (local_err) {
612             qemu_close(rs->fd);
613             rs->fd = -1;
614             error_propagate(errp, local_err);
615             ret = -EINVAL;
616         }
617     }
618 
619     return ret;
620 }
621 
622 static void raw_reopen_commit(BDRVReopenState *state)
623 {
624     BDRVRawReopenState *rs = state->opaque;
625     BDRVRawState *s = state->bs->opaque;
626 
627     s->open_flags = rs->open_flags;
628 
629     qemu_close(s->fd);
630     s->fd = rs->fd;
631 
632     g_free(state->opaque);
633     state->opaque = NULL;
634 }
635 
636 
637 static void raw_reopen_abort(BDRVReopenState *state)
638 {
639     BDRVRawReopenState *rs = state->opaque;
640 
641      /* nothing to do if NULL, we didn't get far enough */
642     if (rs == NULL) {
643         return;
644     }
645 
646     if (rs->fd >= 0) {
647         qemu_close(rs->fd);
648         rs->fd = -1;
649     }
650     g_free(state->opaque);
651     state->opaque = NULL;
652 }
653 
654 static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
655 {
656 #ifdef BLKSECTGET
657     int max_bytes = 0;
658     short max_sectors = 0;
659     if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
660         return max_bytes;
661     } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
662         return max_sectors << BDRV_SECTOR_BITS;
663     } else {
664         return -errno;
665     }
666 #else
667     return -ENOSYS;
668 #endif
669 }
670 
/*
 * Read the maximum number of segments per request for the block device
 * identified by st->st_rdev from sysfs (queue/max_segments).
 *
 * Returns the value parsed from sysfs, a negative errno if the attribute
 * cannot be opened, read or parsed, or -ENOTSUP on non-Linux builds.
 * Callers only treat results greater than zero as a limit.
 */
static int hdev_get_max_segments(const struct stat *st)
{
#ifdef CONFIG_LINUX
    char buf[32];
    const char *end;
    char *sysfspath;
    int ret;
    int fd = -1;
    long max_segments;

    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                                major(st->st_rdev), minor(st->st_rdev));
    fd = open(sysfspath, O_RDONLY);
    if (fd == -1) {
        ret = -errno;
        goto out;
    }
    do {
        /* Leave one byte free for the NUL terminator added below */
        ret = read(fd, buf, sizeof(buf) - 1);
    } while (ret == -1 && errno == EINTR);
    if (ret < 0) {
        ret = -errno;
        goto out;
    } else if (ret == 0) {
        ret = -EIO;
        goto out;
    }
    buf[ret] = 0;
    /* The file is ended with '\n', pass 'end' to accept that. */
    ret = qemu_strtol(buf, &end, 10, &max_segments);
    if (ret == 0 && end && *end == '\n') {
        ret = max_segments;
    }

out:
    /* ret is final by now, so close() clobbering errno is harmless */
    if (fd != -1) {
        close(fd);
    }
    g_free(sysfspath);
    return ret;
#else
    return -ENOTSUP;
#endif
}
712 
713 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
714 {
715     BDRVRawState *s = bs->opaque;
716     struct stat st;
717 
718     if (!fstat(s->fd, &st)) {
719         if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
720             int ret = hdev_get_max_transfer_length(bs, s->fd);
721             if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
722                 bs->bl.max_transfer = pow2floor(ret);
723             }
724             ret = hdev_get_max_segments(&st);
725             if (ret > 0) {
726                 bs->bl.max_transfer = MIN(bs->bl.max_transfer,
727                                           ret * getpagesize());
728             }
729         }
730     }
731 
732     raw_probe_alignment(bs, s->fd, errp);
733     bs->bl.min_mem_alignment = s->buf_align;
734     bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
735 }
736 
/*
 * Issue BIODASDINFO2 on @fd and return the ioctl's result; callers treat a
 * negative result as "not a DASD".  Always returns -1 where the ioctl is not
 * available.
 */
static int check_for_dasd(int fd)
{
#ifndef BIODASDINFO2
    return -1;
#else
    struct dasd_information2_t info = {0};

    return ioctl(fd, BIODASDINFO2, &info);
#endif
}
747 
748 /**
749  * Try to get @bs's logical and physical block size.
750  * On success, store them in @bsz and return zero.
751  * On failure, return negative errno.
752  */
753 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
754 {
755     BDRVRawState *s = bs->opaque;
756     int ret;
757 
758     /* If DASD, get blocksizes */
759     if (check_for_dasd(s->fd) < 0) {
760         return -ENOTSUP;
761     }
762     ret = probe_logical_blocksize(s->fd, &bsz->log);
763     if (ret < 0) {
764         return ret;
765     }
766     return probe_physical_blocksize(s->fd, &bsz->phys);
767 }
768 
769 /**
770  * Try to get @bs's geometry: cyls, heads, sectors.
771  * On success, store them in @geo and return 0.
772  * On failure return -errno.
773  * (Allows block driver to assign default geometry values that guest sees)
774  */
775 #ifdef __linux__
776 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
777 {
778     BDRVRawState *s = bs->opaque;
779     struct hd_geometry ioctl_geo = {0};
780 
781     /* If DASD, get its geometry */
782     if (check_for_dasd(s->fd) < 0) {
783         return -ENOTSUP;
784     }
785     if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
786         return -errno;
787     }
788     /* HDIO_GETGEO may return success even though geo contains zeros
789        (e.g. certain multipath setups) */
790     if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
791         return -ENOTSUP;
792     }
793     /* Do not return a geometry for partition */
794     if (ioctl_geo.start != 0) {
795         return -ENOTSUP;
796     }
797     geo->heads = ioctl_geo.heads;
798     geo->sectors = ioctl_geo.sectors;
799     geo->cylinders = ioctl_geo.cylinders;
800 
801     return 0;
802 }
803 #else /* __linux__ */
804 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
805 {
806     return -ENOTSUP;
807 }
808 #endif
809 
810 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
811 {
812     int ret;
813 
814     ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
815     if (ret == -1) {
816         return -errno;
817     }
818 
819     return 0;
820 }
821 
822 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
823 {
824     int ret;
825 
826     ret = qemu_fdatasync(aiocb->aio_fildes);
827     if (ret == -1) {
828         return -errno;
829     }
830     return 0;
831 }
832 
/*
 * Thin wrappers around preadv()/pwritev().  Without CONFIG_PREADV they are
 * stubs returning -ENOSYS and preadv_present starts out false, so
 * handle_aiocb_rw() never tries the vectored path.
 */
#ifndef CONFIG_PREADV

static bool preadv_present = false;

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

#else /* CONFIG_PREADV */

/* Cleared by handle_aiocb_rw() once the vectored path has to be given up */
static bool preadv_present = true;

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return preadv(fd, iov, nr_iov, offset);
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return pwritev(fd, iov, nr_iov, offset);
}

#endif
866 
867 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
868 {
869     ssize_t len;
870 
871     do {
872         if (aiocb->aio_type & QEMU_AIO_WRITE)
873             len = qemu_pwritev(aiocb->aio_fildes,
874                                aiocb->aio_iov,
875                                aiocb->aio_niov,
876                                aiocb->aio_offset);
877          else
878             len = qemu_preadv(aiocb->aio_fildes,
879                               aiocb->aio_iov,
880                               aiocb->aio_niov,
881                               aiocb->aio_offset);
882     } while (len == -1 && errno == EINTR);
883 
884     if (len == -1) {
885         return -errno;
886     }
887     return len;
888 }
889 
890 /*
891  * Read/writes the data to/from a given linear buffer.
892  *
893  * Returns the number of bytes handles or -errno in case of an error. Short
894  * reads are only returned if the end of the file is reached.
895  */
896 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
897 {
898     ssize_t offset = 0;
899     ssize_t len;
900 
901     while (offset < aiocb->aio_nbytes) {
902         if (aiocb->aio_type & QEMU_AIO_WRITE) {
903             len = pwrite(aiocb->aio_fildes,
904                          (const char *)buf + offset,
905                          aiocb->aio_nbytes - offset,
906                          aiocb->aio_offset + offset);
907         } else {
908             len = pread(aiocb->aio_fildes,
909                         buf + offset,
910                         aiocb->aio_nbytes - offset,
911                         aiocb->aio_offset + offset);
912         }
913         if (len == -1 && errno == EINTR) {
914             continue;
915         } else if (len == -1 && errno == EINVAL &&
916                    (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
917                    !(aiocb->aio_type & QEMU_AIO_WRITE) &&
918                    offset > 0) {
919             /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
920              * after a short read.  Assume that O_DIRECT short reads only occur
921              * at EOF.  Therefore this is a short read, not an I/O error.
922              */
923             break;
924         } else if (len == -1) {
925             offset = -errno;
926             break;
927         } else if (len == 0) {
928             break;
929         }
930         offset += len;
931     }
932 
933     return offset;
934 }
935 
936 static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
937 {
938     ssize_t nbytes;
939     char *buf;
940 
941     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
942         /*
943          * If there is just a single buffer, and it is properly aligned
944          * we can just use plain pread/pwrite without any problems.
945          */
946         if (aiocb->aio_niov == 1) {
947              return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
948         }
949         /*
950          * We have more than one iovec, and all are properly aligned.
951          *
952          * Try preadv/pwritev first and fall back to linearizing the
953          * buffer if it's not supported.
954          */
955         if (preadv_present) {
956             nbytes = handle_aiocb_rw_vector(aiocb);
957             if (nbytes == aiocb->aio_nbytes ||
958                 (nbytes < 0 && nbytes != -ENOSYS)) {
959                 return nbytes;
960             }
961             preadv_present = false;
962         }
963 
964         /*
965          * XXX(hch): short read/write.  no easy way to handle the reminder
966          * using these interfaces.  For now retry using plain
967          * pread/pwrite?
968          */
969     }
970 
971     /*
972      * Ok, we have to do it the hard way, copy all segments into
973      * a single aligned buffer.
974      */
975     buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
976     if (buf == NULL) {
977         return -ENOMEM;
978     }
979 
980     if (aiocb->aio_type & QEMU_AIO_WRITE) {
981         char *p = buf;
982         int i;
983 
984         for (i = 0; i < aiocb->aio_niov; ++i) {
985             memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
986             p += aiocb->aio_iov[i].iov_len;
987         }
988         assert(p - buf == aiocb->aio_nbytes);
989     }
990 
991     nbytes = handle_aiocb_rw_linear(aiocb, buf);
992     if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
993         char *p = buf;
994         size_t count = aiocb->aio_nbytes, copy;
995         int i;
996 
997         for (i = 0; i < aiocb->aio_niov && count; ++i) {
998             copy = count;
999             if (copy > aiocb->aio_iov[i].iov_len) {
1000                 copy = aiocb->aio_iov[i].iov_len;
1001             }
1002             memcpy(aiocb->aio_iov[i].iov_base, p, copy);
1003             assert(count >= copy);
1004             p     += copy;
1005             count -= copy;
1006         }
1007         assert(count == 0);
1008     }
1009     qemu_vfree(buf);
1010 
1011     return nbytes;
1012 }
1013 
#ifdef CONFIG_XFS
/*
 * Issue XFS_IOC_ZERO_RANGE for @bytes bytes starting at @offset of s->fd.
 * Returns 0 on success, -errno on failure.
 */
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 fl;
    int err = 0;

    memset(&fl, 0, sizeof(fl));
    fl.l_whence = SEEK_SET;
    fl.l_start = offset;
    fl.l_len = bytes;

    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
        /* save errno before the debug print can touch it */
        err = errno;
        DPRINTF("cannot write zero range (%s)\n", strerror(errno));
    }

    return -err;
}

/*
 * Issue XFS_IOC_UNRESVSP64 for @bytes bytes starting at @offset of s->fd.
 * Returns 0 on success, -errno on failure.
 */
static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 fl;
    int err = 0;

    memset(&fl, 0, sizeof(fl));
    fl.l_whence = SEEK_SET;
    fl.l_start = offset;
    fl.l_len = bytes;

    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
        /* save errno before the debug print can touch it */
        err = errno;
        DPRINTF("cannot punch hole (%s)\n", strerror(errno));
    }

    return -err;
}
#endif
1053 
/*
 * Collapse the negative errno values -ENODEV, -ENOSYS, -EOPNOTSUPP and
 * -ENOTTY into -ENOTSUP.  Any other value is returned unchanged.
 */
static int translate_err(int err)
{
    switch (err) {
    case -ENODEV:
    case -ENOSYS:
    case -EOPNOTSUPP:
    case -ENOTTY:
        return -ENOTSUP;
    default:
        return err;
    }
}
1062 
1063 #ifdef CONFIG_FALLOCATE
/*
 * Call fallocate(@fd, @mode, @offset, @len), restarting it for as long as it
 * fails with EINTR.
 *
 * Returns 0 on success, otherwise the negated errno after it has been passed
 * through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    for (;;) {
        if (fallocate(fd, mode, offset, len) == 0) {
            return 0;
        }
        if (errno != EINTR) {
            return translate_err(-errno);
        }
    }
}
1073 #endif
1074 
1075 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1076 {
1077     int ret = -ENOTSUP;
1078     BDRVRawState *s = aiocb->bs->opaque;
1079 
1080     if (!s->has_write_zeroes) {
1081         return -ENOTSUP;
1082     }
1083 
1084 #ifdef BLKZEROOUT
1085     do {
1086         uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1087         if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1088             return 0;
1089         }
1090     } while (errno == EINTR);
1091 
1092     ret = translate_err(-errno);
1093 #endif
1094 
1095     if (ret == -ENOTSUP) {
1096         s->has_write_zeroes = false;
1097     }
1098     return ret;
1099 }
1100 
1101 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
1102 {
1103 #if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
1104     BDRVRawState *s = aiocb->bs->opaque;
1105 #endif
1106 
1107     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1108         return handle_aiocb_write_zeroes_block(aiocb);
1109     }
1110 
1111 #ifdef CONFIG_XFS
1112     if (s->is_xfs) {
1113         return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
1114     }
1115 #endif
1116 
1117 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
1118     if (s->has_write_zeroes) {
1119         int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1120                                aiocb->aio_offset, aiocb->aio_nbytes);
1121         if (ret == 0 || ret != -ENOTSUP) {
1122             return ret;
1123         }
1124         s->has_write_zeroes = false;
1125     }
1126 #endif
1127 
1128 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1129     if (s->has_discard && s->has_fallocate) {
1130         int ret = do_fallocate(s->fd,
1131                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1132                                aiocb->aio_offset, aiocb->aio_nbytes);
1133         if (ret == 0) {
1134             ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1135             if (ret == 0 || ret != -ENOTSUP) {
1136                 return ret;
1137             }
1138             s->has_fallocate = false;
1139         } else if (ret != -ENOTSUP) {
1140             return ret;
1141         } else {
1142             s->has_discard = false;
1143         }
1144     }
1145 #endif
1146 
1147 #ifdef CONFIG_FALLOCATE
1148     if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) {
1149         int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1150         if (ret == 0 || ret != -ENOTSUP) {
1151             return ret;
1152         }
1153         s->has_fallocate = false;
1154     }
1155 #endif
1156 
1157     return -ENOTSUP;
1158 }
1159 
1160 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
1161 {
1162     int ret = -EOPNOTSUPP;
1163     BDRVRawState *s = aiocb->bs->opaque;
1164 
1165     if (!s->has_discard) {
1166         return -ENOTSUP;
1167     }
1168 
1169     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1170 #ifdef BLKDISCARD
1171         do {
1172             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1173             if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1174                 return 0;
1175             }
1176         } while (errno == EINTR);
1177 
1178         ret = -errno;
1179 #endif
1180     } else {
1181 #ifdef CONFIG_XFS
1182         if (s->is_xfs) {
1183             return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1184         }
1185 #endif
1186 
1187 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1188         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1189                            aiocb->aio_offset, aiocb->aio_nbytes);
1190 #endif
1191     }
1192 
1193     ret = translate_err(ret);
1194     if (ret == -ENOTSUP) {
1195         s->has_discard = false;
1196     }
1197     return ret;
1198 }
1199 
1200 static int aio_worker(void *arg)
1201 {
1202     RawPosixAIOData *aiocb = arg;
1203     ssize_t ret = 0;
1204 
1205     switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
1206     case QEMU_AIO_READ:
1207         ret = handle_aiocb_rw(aiocb);
1208         if (ret >= 0 && ret < aiocb->aio_nbytes) {
1209             iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
1210                       0, aiocb->aio_nbytes - ret);
1211 
1212             ret = aiocb->aio_nbytes;
1213         }
1214         if (ret == aiocb->aio_nbytes) {
1215             ret = 0;
1216         } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1217             ret = -EINVAL;
1218         }
1219         break;
1220     case QEMU_AIO_WRITE:
1221         ret = handle_aiocb_rw(aiocb);
1222         if (ret == aiocb->aio_nbytes) {
1223             ret = 0;
1224         } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1225             ret = -EINVAL;
1226         }
1227         break;
1228     case QEMU_AIO_FLUSH:
1229         ret = handle_aiocb_flush(aiocb);
1230         break;
1231     case QEMU_AIO_IOCTL:
1232         ret = handle_aiocb_ioctl(aiocb);
1233         break;
1234     case QEMU_AIO_DISCARD:
1235         ret = handle_aiocb_discard(aiocb);
1236         break;
1237     case QEMU_AIO_WRITE_ZEROES:
1238         ret = handle_aiocb_write_zeroes(aiocb);
1239         break;
1240     default:
1241         fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
1242         ret = -EINVAL;
1243         break;
1244     }
1245 
1246     g_free(aiocb);
1247     return ret;
1248 }
1249 
1250 static int paio_submit_co(BlockDriverState *bs, int fd,
1251                           int64_t offset, QEMUIOVector *qiov,
1252                           int count, int type)
1253 {
1254     RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1255     ThreadPool *pool;
1256 
1257     acb->bs = bs;
1258     acb->aio_type = type;
1259     acb->aio_fildes = fd;
1260 
1261     acb->aio_nbytes = count;
1262     acb->aio_offset = offset;
1263 
1264     if (qiov) {
1265         acb->aio_iov = qiov->iov;
1266         acb->aio_niov = qiov->niov;
1267         assert(qiov->size == count);
1268     }
1269 
1270     trace_paio_submit_co(offset, count, type);
1271     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1272     return thread_pool_submit_co(pool, aio_worker, acb);
1273 }
1274 
1275 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
1276         int64_t offset, QEMUIOVector *qiov, int count,
1277         BlockCompletionFunc *cb, void *opaque, int type)
1278 {
1279     RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1280     ThreadPool *pool;
1281 
1282     acb->bs = bs;
1283     acb->aio_type = type;
1284     acb->aio_fildes = fd;
1285 
1286     acb->aio_nbytes = count;
1287     acb->aio_offset = offset;
1288 
1289     if (qiov) {
1290         acb->aio_iov = qiov->iov;
1291         acb->aio_niov = qiov->niov;
1292         assert(qiov->size == acb->aio_nbytes);
1293     }
1294 
1295     trace_paio_submit(acb, opaque, offset, count, type);
1296     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1297     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
1298 }
1299 
1300 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1301                                    uint64_t bytes, QEMUIOVector *qiov, int type)
1302 {
1303     BDRVRawState *s = bs->opaque;
1304 
1305     if (fd_open(bs) < 0)
1306         return -EIO;
1307 
1308     /*
1309      * Check if the underlying device requires requests to be aligned,
1310      * and if the request we are trying to submit is aligned or not.
1311      * If this is the case tell the low-level driver that it needs
1312      * to copy the buffer.
1313      */
1314     if (s->needs_alignment) {
1315         if (!bdrv_qiov_is_aligned(bs, qiov)) {
1316             type |= QEMU_AIO_MISALIGNED;
1317 #ifdef CONFIG_LINUX_AIO
1318         } else if (s->use_linux_aio) {
1319             LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1320             assert(qiov->size == bytes);
1321             return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
1322 #endif
1323         }
1324     }
1325 
1326     return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
1327 }
1328 
1329 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
1330                                       uint64_t bytes, QEMUIOVector *qiov,
1331                                       int flags)
1332 {
1333     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
1334 }
1335 
1336 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
1337                                        uint64_t bytes, QEMUIOVector *qiov,
1338                                        int flags)
1339 {
1340     assert(flags == 0);
1341     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
1342 }
1343 
1344 static void raw_aio_plug(BlockDriverState *bs)
1345 {
1346 #ifdef CONFIG_LINUX_AIO
1347     BDRVRawState *s = bs->opaque;
1348     if (s->use_linux_aio) {
1349         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1350         laio_io_plug(bs, aio);
1351     }
1352 #endif
1353 }
1354 
1355 static void raw_aio_unplug(BlockDriverState *bs)
1356 {
1357 #ifdef CONFIG_LINUX_AIO
1358     BDRVRawState *s = bs->opaque;
1359     if (s->use_linux_aio) {
1360         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1361         laio_io_unplug(bs, aio);
1362     }
1363 #endif
1364 }
1365 
1366 static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
1367         BlockCompletionFunc *cb, void *opaque)
1368 {
1369     BDRVRawState *s = bs->opaque;
1370 
1371     if (fd_open(bs) < 0)
1372         return NULL;
1373 
1374     return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
1375 }
1376 
1377 static void raw_close(BlockDriverState *bs)
1378 {
1379     BDRVRawState *s = bs->opaque;
1380 
1381     if (s->fd >= 0) {
1382         qemu_close(s->fd);
1383         s->fd = -1;
1384     }
1385 }
1386 
1387 static int raw_truncate(BlockDriverState *bs, int64_t offset)
1388 {
1389     BDRVRawState *s = bs->opaque;
1390     struct stat st;
1391 
1392     if (fstat(s->fd, &st)) {
1393         return -errno;
1394     }
1395 
1396     if (S_ISREG(st.st_mode)) {
1397         if (ftruncate(s->fd, offset) < 0) {
1398             return -errno;
1399         }
1400     } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1401        if (offset > raw_getlength(bs)) {
1402            return -EINVAL;
1403        }
1404     } else {
1405         return -ENOTSUP;
1406     }
1407 
1408     return 0;
1409 }
1410 
1411 #ifdef __OpenBSD__
1412 static int64_t raw_getlength(BlockDriverState *bs)
1413 {
1414     BDRVRawState *s = bs->opaque;
1415     int fd = s->fd;
1416     struct stat st;
1417 
1418     if (fstat(fd, &st))
1419         return -errno;
1420     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1421         struct disklabel dl;
1422 
1423         if (ioctl(fd, DIOCGDINFO, &dl))
1424             return -errno;
1425         return (uint64_t)dl.d_secsize *
1426             dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1427     } else
1428         return st.st_size;
1429 }
1430 #elif defined(__NetBSD__)
1431 static int64_t raw_getlength(BlockDriverState *bs)
1432 {
1433     BDRVRawState *s = bs->opaque;
1434     int fd = s->fd;
1435     struct stat st;
1436 
1437     if (fstat(fd, &st))
1438         return -errno;
1439     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1440         struct dkwedge_info dkw;
1441 
1442         if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
1443             return dkw.dkw_size * 512;
1444         } else {
1445             struct disklabel dl;
1446 
1447             if (ioctl(fd, DIOCGDINFO, &dl))
1448                 return -errno;
1449             return (uint64_t)dl.d_secsize *
1450                 dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1451         }
1452     } else
1453         return st.st_size;
1454 }
1455 #elif defined(__sun__)
1456 static int64_t raw_getlength(BlockDriverState *bs)
1457 {
1458     BDRVRawState *s = bs->opaque;
1459     struct dk_minfo minfo;
1460     int ret;
1461     int64_t size;
1462 
1463     ret = fd_open(bs);
1464     if (ret < 0) {
1465         return ret;
1466     }
1467 
1468     /*
1469      * Use the DKIOCGMEDIAINFO ioctl to read the size.
1470      */
1471     ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
1472     if (ret != -1) {
1473         return minfo.dki_lbsize * minfo.dki_capacity;
1474     }
1475 
1476     /*
1477      * There are reports that lseek on some devices fails, but
1478      * irc discussion said that contingency on contingency was overkill.
1479      */
1480     size = lseek(s->fd, 0, SEEK_END);
1481     if (size < 0) {
1482         return -errno;
1483     }
1484     return size;
1485 }
1486 #elif defined(CONFIG_BSD)
/*
 * Generic BSD (including FreeBSD and macOS): return the length of the image
 * in bytes, or a negative value (the fd_open() result or a negated errno).
 *
 * For files whose mode matches S_IFCHR the size is queried through
 * whichever ioctl the platform provides, falling back to lseek(SEEK_END).
 * Everything else is sized with lseek(SEEK_END) directly.
 *
 * The preprocessor branches below deliberately end in an unbraced `if`, so
 * that the statement following the #endif becomes its body.  Keep that in
 * mind before reordering anything here.
 *
 * NOTE(review): the local `fd` is read from s->fd once, before the `again`
 * label.  If cdrom_reopen() replaces s->fd, the retry would still use the
 * old value - confirm against cdrom_reopen().
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
#ifdef DIOCGMEDIASIZE
        /* ioctl failure (non-zero) runs the fallback statement below */
	if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
#elif defined(DIOCGPART)
        {
                struct partinfo pi;
                if (ioctl(fd, DIOCGPART, &pi) == 0)
                        size = pi.media_size;
                else
                        size = 0;
        }
        /* a zero size runs the fallback statement below */
        if (size == 0)
#endif
#if defined(__APPLE__) && defined(__MACH__)
        {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            /* block count times block size, else seek to the end */
            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            } else {
                size = lseek(fd, 0LL, SEEK_END);
                if (size < 0) {
                    return -errno;
                }
            }
        }
#else
        /*
         * Only this assignment is governed by a preceding unbraced `if`;
         * the size < 0 check that follows always runs.
         */
        size = lseek(fd, 0LL, SEEK_END);
        if (size < 0) {
            return -errno;
        }
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            /* `reopened` limits this to a single retry */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
1560 #else
1561 static int64_t raw_getlength(BlockDriverState *bs)
1562 {
1563     BDRVRawState *s = bs->opaque;
1564     int ret;
1565     int64_t size;
1566 
1567     ret = fd_open(bs);
1568     if (ret < 0) {
1569         return ret;
1570     }
1571 
1572     size = lseek(s->fd, 0, SEEK_END);
1573     if (size < 0) {
1574         return -errno;
1575     }
1576     return size;
1577 }
1578 #endif
1579 
1580 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
1581 {
1582     struct stat st;
1583     BDRVRawState *s = bs->opaque;
1584 
1585     if (fstat(s->fd, &st) < 0) {
1586         return -errno;
1587     }
1588     return (int64_t)st.st_blocks * 512;
1589 }
1590 
1591 static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
1592 {
1593     int fd;
1594     int result = 0;
1595     int64_t total_size = 0;
1596     bool nocow = false;
1597     PreallocMode prealloc;
1598     char *buf = NULL;
1599     Error *local_err = NULL;
1600 
1601     strstart(filename, "file:", &filename);
1602 
1603     /* Read out options */
1604     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
1605                           BDRV_SECTOR_SIZE);
1606     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
1607     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
1608     prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
1609                                PREALLOC_MODE__MAX, PREALLOC_MODE_OFF,
1610                                &local_err);
1611     g_free(buf);
1612     if (local_err) {
1613         error_propagate(errp, local_err);
1614         result = -EINVAL;
1615         goto out;
1616     }
1617 
1618     fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
1619                    0644);
1620     if (fd < 0) {
1621         result = -errno;
1622         error_setg_errno(errp, -result, "Could not create file");
1623         goto out;
1624     }
1625 
1626     if (nocow) {
1627 #ifdef __linux__
1628         /* Set NOCOW flag to solve performance issue on fs like btrfs.
1629          * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
1630          * will be ignored since any failure of this operation should not
1631          * block the left work.
1632          */
1633         int attr;
1634         if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
1635             attr |= FS_NOCOW_FL;
1636             ioctl(fd, FS_IOC_SETFLAGS, &attr);
1637         }
1638 #endif
1639     }
1640 
1641     switch (prealloc) {
1642 #ifdef CONFIG_POSIX_FALLOCATE
1643     case PREALLOC_MODE_FALLOC:
1644         /*
1645          * Truncating before posix_fallocate() makes it about twice slower on
1646          * file systems that do not support fallocate(), trying to check if a
1647          * block is allocated before allocating it, so don't do that here.
1648          */
1649         result = -posix_fallocate(fd, 0, total_size);
1650         if (result != 0) {
1651             /* posix_fallocate() doesn't set errno. */
1652             error_setg_errno(errp, -result,
1653                              "Could not preallocate data for the new file");
1654         }
1655         break;
1656 #endif
1657     case PREALLOC_MODE_FULL:
1658     {
1659         /*
1660          * Knowing the final size from the beginning could allow the file
1661          * system driver to do less allocations and possibly avoid
1662          * fragmentation of the file.
1663          */
1664         if (ftruncate(fd, total_size) != 0) {
1665             result = -errno;
1666             error_setg_errno(errp, -result, "Could not resize file");
1667             goto out_close;
1668         }
1669 
1670         int64_t num = 0, left = total_size;
1671         buf = g_malloc0(65536);
1672 
1673         while (left > 0) {
1674             num = MIN(left, 65536);
1675             result = write(fd, buf, num);
1676             if (result < 0) {
1677                 result = -errno;
1678                 error_setg_errno(errp, -result,
1679                                  "Could not write to the new file");
1680                 break;
1681             }
1682             left -= result;
1683         }
1684         if (result >= 0) {
1685             result = fsync(fd);
1686             if (result < 0) {
1687                 result = -errno;
1688                 error_setg_errno(errp, -result,
1689                                  "Could not flush new file to disk");
1690             }
1691         }
1692         g_free(buf);
1693         break;
1694     }
1695     case PREALLOC_MODE_OFF:
1696         if (ftruncate(fd, total_size) != 0) {
1697             result = -errno;
1698             error_setg_errno(errp, -result, "Could not resize file");
1699         }
1700         break;
1701     default:
1702         result = -EINVAL;
1703         error_setg(errp, "Unsupported preallocation mode: %s",
1704                    PreallocMode_lookup[prealloc]);
1705         break;
1706     }
1707 
1708 out_close:
1709     if (qemu_close(fd) != 0 && result == 0) {
1710         result = -errno;
1711         error_setg_errno(errp, -result, "Could not close the new file");
1712     }
1713 out:
1714     return result;
1715 }
1716 
1717 /*
1718  * Find allocation range in @bs around offset @start.
1719  * May change underlying file descriptor's file offset.
1720  * If @start is not in a hole, store @start in @data, and the
1721  * beginning of the next hole in @hole, and return 0.
1722  * If @start is in a non-trailing hole, store @start in @hole and the
1723  * beginning of the next non-hole in @data, and return 0.
1724  * If @start is in a trailing hole or beyond EOF, return -ENXIO.
1725  * If we can't find out, return a negative errno other than -ENXIO.
1726  */
1727 static int find_allocation(BlockDriverState *bs, off_t start,
1728                            off_t *data, off_t *hole)
1729 {
1730 #if defined SEEK_HOLE && defined SEEK_DATA
1731     BDRVRawState *s = bs->opaque;
1732     off_t offs;
1733 
1734     /*
1735      * SEEK_DATA cases:
1736      * D1. offs == start: start is in data
1737      * D2. offs > start: start is in a hole, next data at offs
1738      * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
1739      *                              or start is beyond EOF
1740      *     If the latter happens, the file has been truncated behind
1741      *     our back since we opened it.  All bets are off then.
1742      *     Treating like a trailing hole is simplest.
1743      * D4. offs < 0, errno != ENXIO: we learned nothing
1744      */
1745     offs = lseek(s->fd, start, SEEK_DATA);
1746     if (offs < 0) {
1747         return -errno;          /* D3 or D4 */
1748     }
1749     assert(offs >= start);
1750 
1751     if (offs > start) {
1752         /* D2: in hole, next data at offs */
1753         *hole = start;
1754         *data = offs;
1755         return 0;
1756     }
1757 
1758     /* D1: in data, end not yet known */
1759 
1760     /*
1761      * SEEK_HOLE cases:
1762      * H1. offs == start: start is in a hole
1763      *     If this happens here, a hole has been dug behind our back
1764      *     since the previous lseek().
1765      * H2. offs > start: either start is in data, next hole at offs,
1766      *                   or start is in trailing hole, EOF at offs
1767      *     Linux treats trailing holes like any other hole: offs ==
1768      *     start.  Solaris seeks to EOF instead: offs > start (blech).
1769      *     If that happens here, a hole has been dug behind our back
1770      *     since the previous lseek().
1771      * H3. offs < 0, errno = ENXIO: start is beyond EOF
1772      *     If this happens, the file has been truncated behind our
1773      *     back since we opened it.  Treat it like a trailing hole.
1774      * H4. offs < 0, errno != ENXIO: we learned nothing
1775      *     Pretend we know nothing at all, i.e. "forget" about D1.
1776      */
1777     offs = lseek(s->fd, start, SEEK_HOLE);
1778     if (offs < 0) {
1779         return -errno;          /* D1 and (H3 or H4) */
1780     }
1781     assert(offs >= start);
1782 
1783     if (offs > start) {
1784         /*
1785          * D1 and H2: either in data, next hole at offs, or it was in
1786          * data but is now in a trailing hole.  In the latter case,
1787          * all bets are off.  Treating it as if it there was data all
1788          * the way to EOF is safe, so simply do that.
1789          */
1790         *data = start;
1791         *hole = offs;
1792         return 0;
1793     }
1794 
1795     /* D1 and H1 */
1796     return -EBUSY;
1797 #else
1798     return -ENOTSUP;
1799 #endif
1800 }
1801 
1802 /*
1803  * Returns the allocation status of the specified sectors.
1804  *
1805  * If 'sector_num' is beyond the end of the disk image the return value is 0
1806  * and 'pnum' is set to 0.
1807  *
1808  * 'pnum' is set to the number of sectors (including and immediately following
1809  * the specified sector) that are known to be in the same
1810  * allocated/unallocated state.
1811  *
1812  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1813  * beyond the end of the disk image it will be clamped.
1814  */
1815 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
1816                                                     int64_t sector_num,
1817                                                     int nb_sectors, int *pnum,
1818                                                     BlockDriverState **file)
1819 {
1820     off_t start, data = 0, hole = 0;
1821     int64_t total_size;
1822     int ret;
1823 
1824     ret = fd_open(bs);
1825     if (ret < 0) {
1826         return ret;
1827     }
1828 
1829     start = sector_num * BDRV_SECTOR_SIZE;
1830     total_size = bdrv_getlength(bs);
1831     if (total_size < 0) {
1832         return total_size;
1833     } else if (start >= total_size) {
1834         *pnum = 0;
1835         return 0;
1836     } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
1837         nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
1838     }
1839 
1840     ret = find_allocation(bs, start, &data, &hole);
1841     if (ret == -ENXIO) {
1842         /* Trailing hole */
1843         *pnum = nb_sectors;
1844         ret = BDRV_BLOCK_ZERO;
1845     } else if (ret < 0) {
1846         /* No info available, so pretend there are no holes */
1847         *pnum = nb_sectors;
1848         ret = BDRV_BLOCK_DATA;
1849     } else if (data == start) {
1850         /* On a data extent, compute sectors to the end of the extent,
1851          * possibly including a partial sector at EOF. */
1852         *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
1853         ret = BDRV_BLOCK_DATA;
1854     } else {
1855         /* On a hole, compute sectors to the beginning of the next extent.  */
1856         assert(hole == start);
1857         *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
1858         ret = BDRV_BLOCK_ZERO;
1859     }
1860     *file = bs;
1861     return ret | BDRV_BLOCK_OFFSET_VALID | start;
1862 }
1863 
1864 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
1865     int64_t offset, int count,
1866     BlockCompletionFunc *cb, void *opaque)
1867 {
1868     BDRVRawState *s = bs->opaque;
1869 
1870     return paio_submit(bs, s->fd, offset, NULL, count,
1871                        cb, opaque, QEMU_AIO_DISCARD);
1872 }
1873 
1874 static int coroutine_fn raw_co_pwrite_zeroes(
1875     BlockDriverState *bs, int64_t offset,
1876     int count, BdrvRequestFlags flags)
1877 {
1878     BDRVRawState *s = bs->opaque;
1879 
1880     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
1881         return paio_submit_co(bs, s->fd, offset, NULL, count,
1882                               QEMU_AIO_WRITE_ZEROES);
1883     } else if (s->discard_zeroes) {
1884         return paio_submit_co(bs, s->fd, offset, NULL, count,
1885                               QEMU_AIO_DISCARD);
1886     }
1887     return -ENOTSUP;
1888 }
1889 
1890 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1891 {
1892     BDRVRawState *s = bs->opaque;
1893 
1894     bdi->unallocated_blocks_are_zero = s->discard_zeroes;
1895     bdi->can_write_zeroes_with_unmap = s->discard_zeroes;
1896     return 0;
1897 }
1898 
1899 static QemuOptsList raw_create_opts = {
1900     .name = "raw-create-opts",
1901     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
1902     .desc = {
1903         {
1904             .name = BLOCK_OPT_SIZE,
1905             .type = QEMU_OPT_SIZE,
1906             .help = "Virtual disk size"
1907         },
1908         {
1909             .name = BLOCK_OPT_NOCOW,
1910             .type = QEMU_OPT_BOOL,
1911             .help = "Turn off copy-on-write (valid only on btrfs)"
1912         },
1913         {
1914             .name = BLOCK_OPT_PREALLOC,
1915             .type = QEMU_OPT_STRING,
1916             .help = "Preallocation mode (allowed values: off, falloc, full)"
1917         },
1918         { /* end of list */ }
1919     }
1920 };
1921 
1922 BlockDriver bdrv_file = {
1923     .format_name = "file",
1924     .protocol_name = "file",
1925     .instance_size = sizeof(BDRVRawState),
1926     .bdrv_needs_filename = true,
1927     .bdrv_probe = NULL, /* no probe for protocols */
1928     .bdrv_parse_filename = raw_parse_filename,
1929     .bdrv_file_open = raw_open,
1930     .bdrv_reopen_prepare = raw_reopen_prepare,
1931     .bdrv_reopen_commit = raw_reopen_commit,
1932     .bdrv_reopen_abort = raw_reopen_abort,
1933     .bdrv_close = raw_close,
1934     .bdrv_create = raw_create,
1935     .bdrv_has_zero_init = bdrv_has_zero_init_1,
1936     .bdrv_co_get_block_status = raw_co_get_block_status,
1937     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
1938 
1939     .bdrv_co_preadv         = raw_co_preadv,
1940     .bdrv_co_pwritev        = raw_co_pwritev,
1941     .bdrv_aio_flush = raw_aio_flush,
1942     .bdrv_aio_pdiscard = raw_aio_pdiscard,
1943     .bdrv_refresh_limits = raw_refresh_limits,
1944     .bdrv_io_plug = raw_aio_plug,
1945     .bdrv_io_unplug = raw_aio_unplug,
1946 
1947     .bdrv_truncate = raw_truncate,
1948     .bdrv_getlength = raw_getlength,
1949     .bdrv_get_info = raw_get_info,
1950     .bdrv_get_allocated_file_size
1951                         = raw_get_allocated_file_size,
1952 
1953     .create_opts = &raw_create_opts,
1954 };
1955 
1956 /***********************************************/
1957 /* host device */
1958 
1959 #if defined(__APPLE__) && defined(__MACH__)
1960 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
1961                                 CFIndex maxPathSize, int flags);
1962 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
1963 {
1964     kern_return_t kernResult = KERN_FAILURE;
1965     mach_port_t     masterPort;
1966     CFMutableDictionaryRef  classesToMatch;
1967     const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
1968     char *mediaType = NULL;
1969 
1970     kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
1971     if ( KERN_SUCCESS != kernResult ) {
1972         printf( "IOMasterPort returned %d\n", kernResult );
1973     }
1974 
1975     int index;
1976     for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
1977         classesToMatch = IOServiceMatching(matching_array[index]);
1978         if (classesToMatch == NULL) {
1979             error_report("IOServiceMatching returned NULL for %s",
1980                          matching_array[index]);
1981             continue;
1982         }
1983         CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
1984                              kCFBooleanTrue);
1985         kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
1986                                                   mediaIterator);
1987         if (kernResult != KERN_SUCCESS) {
1988             error_report("Note: IOServiceGetMatchingServices returned %d",
1989                          kernResult);
1990             continue;
1991         }
1992 
1993         /* If a match was found, leave the loop */
1994         if (*mediaIterator != 0) {
1995             DPRINTF("Matching using %s\n", matching_array[index]);
1996             mediaType = g_strdup(matching_array[index]);
1997             break;
1998         }
1999     }
2000     return mediaType;
2001 }
2002 
2003 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2004                          CFIndex maxPathSize, int flags)
2005 {
2006     io_object_t     nextMedia;
2007     kern_return_t   kernResult = KERN_FAILURE;
2008     *bsdPath = '\0';
2009     nextMedia = IOIteratorNext( mediaIterator );
2010     if ( nextMedia )
2011     {
2012         CFTypeRef   bsdPathAsCFString;
2013     bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
2014         if ( bsdPathAsCFString ) {
2015             size_t devPathLength;
2016             strcpy( bsdPath, _PATH_DEV );
2017             if (flags & BDRV_O_NOCACHE) {
2018                 strcat(bsdPath, "r");
2019             }
2020             devPathLength = strlen( bsdPath );
2021             if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
2022                 kernResult = KERN_SUCCESS;
2023             }
2024             CFRelease( bsdPathAsCFString );
2025         }
2026         IOObjectRelease( nextMedia );
2027     }
2028 
2029     return kernResult;
2030 }
2031 
2032 /* Sets up a real cdrom for use in QEMU */
2033 static bool setup_cdrom(char *bsd_path, Error **errp)
2034 {
2035     int index, num_of_test_partitions = 2, fd;
2036     char test_partition[MAXPATHLEN];
2037     bool partition_found = false;
2038 
2039     /* look for a working partition */
2040     for (index = 0; index < num_of_test_partitions; index++) {
2041         snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
2042                  index);
2043         fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
2044         if (fd >= 0) {
2045             partition_found = true;
2046             qemu_close(fd);
2047             break;
2048         }
2049     }
2050 
2051     /* if a working partition on the device was not found */
2052     if (partition_found == false) {
2053         error_setg(errp, "Failed to find a working partition on disc");
2054     } else {
2055         DPRINTF("Using %s as optical disc\n", test_partition);
2056         pstrcpy(bsd_path, MAXPATHLEN, test_partition);
2057     }
2058     return partition_found;
2059 }
2060 
/*
 * Report, via error_report(), a hint that file_name may need to be
 * unmounted first, followed by the diskutil commands for unmounting and
 * mounting it.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, "
                 "unmount it first before using it in QEMU",
                 file_name);
    error_report("Command to unmount device: diskutil unmountDisk %s",
                 file_name);
    error_report("Command to mount device: diskutil mountDisk %s",
                 file_name);
}
2070 
2071 #endif /* defined(__APPLE__) && defined(__MACH__) */
2072 
2073 static int hdev_probe_device(const char *filename)
2074 {
2075     struct stat st;
2076 
2077     /* allow a dedicated CD-ROM driver to match with a higher priority */
2078     if (strstart(filename, "/dev/cdrom", NULL))
2079         return 50;
2080 
2081     if (stat(filename, &st) >= 0 &&
2082             (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
2083         return 100;
2084     }
2085 
2086     return 0;
2087 }
2088 
2089 static int check_hdev_writable(BDRVRawState *s)
2090 {
2091 #if defined(BLKROGET)
2092     /* Linux block devices can be configured "read-only" using blockdev(8).
2093      * This is independent of device node permissions and therefore open(2)
2094      * with O_RDWR succeeds.  Actual writes fail with EPERM.
2095      *
2096      * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
2097      * check for read-only block devices so that Linux block devices behave
2098      * properly.
2099      */
2100     struct stat st;
2101     int readonly = 0;
2102 
2103     if (fstat(s->fd, &st)) {
2104         return -errno;
2105     }
2106 
2107     if (!S_ISBLK(st.st_mode)) {
2108         return 0;
2109     }
2110 
2111     if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
2112         return -errno;
2113     }
2114 
2115     if (readonly) {
2116         return -EACCES;
2117     }
2118 #endif /* defined(BLKROGET) */
2119     return 0;
2120 }
2121 
2122 static void hdev_parse_filename(const char *filename, QDict *options,
2123                                 Error **errp)
2124 {
2125     /* The prefix is optional, just as for "file". */
2126     strstart(filename, "host_device:", &filename);
2127 
2128     qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
2129 }
2130 
2131 static bool hdev_is_sg(BlockDriverState *bs)
2132 {
2133 
2134 #if defined(__linux__)
2135 
2136     BDRVRawState *s = bs->opaque;
2137     struct stat st;
2138     struct sg_scsi_id scsiid;
2139     int sg_version;
2140     int ret;
2141 
2142     if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
2143         return false;
2144     }
2145 
2146     ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
2147     if (ret < 0) {
2148         return false;
2149     }
2150 
2151     ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
2152     if (ret >= 0) {
2153         DPRINTF("SG device found: type=%d, version=%d\n",
2154             scsiid.scsi_type, sg_version);
2155         return true;
2156     }
2157 
2158 #endif
2159 
2160     return false;
2161 }
2162 
2163 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
2164                      Error **errp)
2165 {
2166     BDRVRawState *s = bs->opaque;
2167     Error *local_err = NULL;
2168     int ret;
2169 
2170 #if defined(__APPLE__) && defined(__MACH__)
2171     const char *filename = qdict_get_str(options, "filename");
2172     char bsd_path[MAXPATHLEN] = "";
2173     bool error_occurred = false;
2174 
2175     /* If using a real cdrom */
2176     if (strcmp(filename, "/dev/cdrom") == 0) {
2177         char *mediaType = NULL;
2178         kern_return_t ret_val;
2179         io_iterator_t mediaIterator = 0;
2180 
2181         mediaType = FindEjectableOpticalMedia(&mediaIterator);
2182         if (mediaType == NULL) {
2183             error_setg(errp, "Please make sure your CD/DVD is in the optical"
2184                        " drive");
2185             error_occurred = true;
2186             goto hdev_open_Mac_error;
2187         }
2188 
2189         ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
2190         if (ret_val != KERN_SUCCESS) {
2191             error_setg(errp, "Could not get BSD path for optical drive");
2192             error_occurred = true;
2193             goto hdev_open_Mac_error;
2194         }
2195 
2196         /* If a real optical drive was not found */
2197         if (bsd_path[0] == '\0') {
2198             error_setg(errp, "Failed to obtain bsd path for optical drive");
2199             error_occurred = true;
2200             goto hdev_open_Mac_error;
2201         }
2202 
2203         /* If using a cdrom disc and finding a partition on the disc failed */
2204         if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
2205             setup_cdrom(bsd_path, errp) == false) {
2206             print_unmounting_directions(bsd_path);
2207             error_occurred = true;
2208             goto hdev_open_Mac_error;
2209         }
2210 
2211         qdict_put(options, "filename", qstring_from_str(bsd_path));
2212 
2213 hdev_open_Mac_error:
2214         g_free(mediaType);
2215         if (mediaIterator) {
2216             IOObjectRelease(mediaIterator);
2217         }
2218         if (error_occurred) {
2219             return -ENOENT;
2220         }
2221     }
2222 #endif /* defined(__APPLE__) && defined(__MACH__) */
2223 
2224     s->type = FTYPE_FILE;
2225 
2226     ret = raw_open_common(bs, options, flags, 0, &local_err);
2227     if (ret < 0) {
2228         error_propagate(errp, local_err);
2229 #if defined(__APPLE__) && defined(__MACH__)
2230         if (*bsd_path) {
2231             filename = bsd_path;
2232         }
2233         /* if a physical device experienced an error while being opened */
2234         if (strncmp(filename, "/dev/", 5) == 0) {
2235             print_unmounting_directions(filename);
2236         }
2237 #endif /* defined(__APPLE__) && defined(__MACH__) */
2238         return ret;
2239     }
2240 
2241     /* Since this does ioctl the device must be already opened */
2242     bs->sg = hdev_is_sg(bs);
2243 
2244     if (flags & BDRV_O_RDWR) {
2245         ret = check_hdev_writable(s);
2246         if (ret < 0) {
2247             raw_close(bs);
2248             error_setg_errno(errp, -ret, "The device is not writable");
2249             return ret;
2250         }
2251     }
2252 
2253     return ret;
2254 }
2255 
2256 #if defined(__linux__)
2257 
2258 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
2259         unsigned long int req, void *buf,
2260         BlockCompletionFunc *cb, void *opaque)
2261 {
2262     BDRVRawState *s = bs->opaque;
2263     RawPosixAIOData *acb;
2264     ThreadPool *pool;
2265 
2266     if (fd_open(bs) < 0)
2267         return NULL;
2268 
2269     acb = g_new(RawPosixAIOData, 1);
2270     acb->bs = bs;
2271     acb->aio_type = QEMU_AIO_IOCTL;
2272     acb->aio_fildes = s->fd;
2273     acb->aio_offset = 0;
2274     acb->aio_ioctl_buf = buf;
2275     acb->aio_ioctl_cmd = req;
2276     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
2277     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
2278 }
2279 #endif /* linux */
2280 
2281 static int fd_open(BlockDriverState *bs)
2282 {
2283     BDRVRawState *s = bs->opaque;
2284 
2285     /* this is just to ensure s->fd is sane (its called by io ops) */
2286     if (s->fd >= 0)
2287         return 0;
2288     return -EIO;
2289 }
2290 
2291 static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs,
2292     int64_t offset, int count,
2293     BlockCompletionFunc *cb, void *opaque)
2294 {
2295     BDRVRawState *s = bs->opaque;
2296 
2297     if (fd_open(bs) < 0) {
2298         return NULL;
2299     }
2300     return paio_submit(bs, s->fd, offset, NULL, count,
2301                        cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
2302 }
2303 
2304 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
2305     int64_t offset, int count, BdrvRequestFlags flags)
2306 {
2307     BDRVRawState *s = bs->opaque;
2308     int rc;
2309 
2310     rc = fd_open(bs);
2311     if (rc < 0) {
2312         return rc;
2313     }
2314     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
2315         return paio_submit_co(bs, s->fd, offset, NULL, count,
2316                               QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
2317     } else if (s->discard_zeroes) {
2318         return paio_submit_co(bs, s->fd, offset, NULL, count,
2319                               QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
2320     }
2321     return -ENOTSUP;
2322 }
2323 
2324 static int hdev_create(const char *filename, QemuOpts *opts,
2325                        Error **errp)
2326 {
2327     int fd;
2328     int ret = 0;
2329     struct stat stat_buf;
2330     int64_t total_size = 0;
2331     bool has_prefix;
2332 
2333     /* This function is used by both protocol block drivers and therefore either
2334      * of these prefixes may be given.
2335      * The return value has to be stored somewhere, otherwise this is an error
2336      * due to -Werror=unused-value. */
2337     has_prefix =
2338         strstart(filename, "host_device:", &filename) ||
2339         strstart(filename, "host_cdrom:" , &filename);
2340 
2341     (void)has_prefix;
2342 
2343     ret = raw_normalize_devicepath(&filename);
2344     if (ret < 0) {
2345         error_setg_errno(errp, -ret, "Could not normalize device path");
2346         return ret;
2347     }
2348 
2349     /* Read out options */
2350     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2351                           BDRV_SECTOR_SIZE);
2352 
2353     fd = qemu_open(filename, O_WRONLY | O_BINARY);
2354     if (fd < 0) {
2355         ret = -errno;
2356         error_setg_errno(errp, -ret, "Could not open device");
2357         return ret;
2358     }
2359 
2360     if (fstat(fd, &stat_buf) < 0) {
2361         ret = -errno;
2362         error_setg_errno(errp, -ret, "Could not stat device");
2363     } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
2364         error_setg(errp,
2365                    "The given file is neither a block nor a character device");
2366         ret = -ENODEV;
2367     } else if (lseek(fd, 0, SEEK_END) < total_size) {
2368         error_setg(errp, "Device is too small");
2369         ret = -ENOSPC;
2370     }
2371 
2372     qemu_close(fd);
2373     return ret;
2374 }
2375 
2376 static BlockDriver bdrv_host_device = {
2377     .format_name        = "host_device",
2378     .protocol_name        = "host_device",
2379     .instance_size      = sizeof(BDRVRawState),
2380     .bdrv_needs_filename = true,
2381     .bdrv_probe_device  = hdev_probe_device,
2382     .bdrv_parse_filename = hdev_parse_filename,
2383     .bdrv_file_open     = hdev_open,
2384     .bdrv_close         = raw_close,
2385     .bdrv_reopen_prepare = raw_reopen_prepare,
2386     .bdrv_reopen_commit  = raw_reopen_commit,
2387     .bdrv_reopen_abort   = raw_reopen_abort,
2388     .bdrv_create         = hdev_create,
2389     .create_opts         = &raw_create_opts,
2390     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
2391 
2392     .bdrv_co_preadv         = raw_co_preadv,
2393     .bdrv_co_pwritev        = raw_co_pwritev,
2394     .bdrv_aio_flush	= raw_aio_flush,
2395     .bdrv_aio_pdiscard   = hdev_aio_pdiscard,
2396     .bdrv_refresh_limits = raw_refresh_limits,
2397     .bdrv_io_plug = raw_aio_plug,
2398     .bdrv_io_unplug = raw_aio_unplug,
2399 
2400     .bdrv_truncate      = raw_truncate,
2401     .bdrv_getlength	= raw_getlength,
2402     .bdrv_get_info = raw_get_info,
2403     .bdrv_get_allocated_file_size
2404                         = raw_get_allocated_file_size,
2405     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
2406     .bdrv_probe_geometry = hdev_probe_geometry,
2407 
2408     /* generic scsi device */
2409 #ifdef __linux__
2410     .bdrv_aio_ioctl     = hdev_aio_ioctl,
2411 #endif
2412 };
2413 
2414 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2415 static void cdrom_parse_filename(const char *filename, QDict *options,
2416                                  Error **errp)
2417 {
2418     /* The prefix is optional, just as for "file". */
2419     strstart(filename, "host_cdrom:", &filename);
2420 
2421     qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
2422 }
2423 #endif
2424 
2425 #ifdef __linux__
2426 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
2427                       Error **errp)
2428 {
2429     BDRVRawState *s = bs->opaque;
2430 
2431     s->type = FTYPE_CD;
2432 
2433     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
2434     return raw_open_common(bs, options, flags, O_NONBLOCK, errp);
2435 }
2436 
2437 static int cdrom_probe_device(const char *filename)
2438 {
2439     int fd, ret;
2440     int prio = 0;
2441     struct stat st;
2442 
2443     fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
2444     if (fd < 0) {
2445         goto out;
2446     }
2447     ret = fstat(fd, &st);
2448     if (ret == -1 || !S_ISBLK(st.st_mode)) {
2449         goto outc;
2450     }
2451 
2452     /* Attempt to detect via a CDROM specific ioctl */
2453     ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
2454     if (ret >= 0)
2455         prio = 100;
2456 
2457 outc:
2458     qemu_close(fd);
2459 out:
2460     return prio;
2461 }
2462 
2463 static bool cdrom_is_inserted(BlockDriverState *bs)
2464 {
2465     BDRVRawState *s = bs->opaque;
2466     int ret;
2467 
2468     ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
2469     return ret == CDS_DISC_OK;
2470 }
2471 
2472 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
2473 {
2474     BDRVRawState *s = bs->opaque;
2475 
2476     if (eject_flag) {
2477         if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
2478             perror("CDROMEJECT");
2479     } else {
2480         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
2481             perror("CDROMEJECT");
2482     }
2483 }
2484 
2485 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
2486 {
2487     BDRVRawState *s = bs->opaque;
2488 
2489     if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
2490         /*
2491          * Note: an error can happen if the distribution automatically
2492          * mounts the CD-ROM
2493          */
2494         /* perror("CDROM_LOCKDOOR"); */
2495     }
2496 }
2497 
2498 static BlockDriver bdrv_host_cdrom = {
2499     .format_name        = "host_cdrom",
2500     .protocol_name      = "host_cdrom",
2501     .instance_size      = sizeof(BDRVRawState),
2502     .bdrv_needs_filename = true,
2503     .bdrv_probe_device	= cdrom_probe_device,
2504     .bdrv_parse_filename = cdrom_parse_filename,
2505     .bdrv_file_open     = cdrom_open,
2506     .bdrv_close         = raw_close,
2507     .bdrv_reopen_prepare = raw_reopen_prepare,
2508     .bdrv_reopen_commit  = raw_reopen_commit,
2509     .bdrv_reopen_abort   = raw_reopen_abort,
2510     .bdrv_create         = hdev_create,
2511     .create_opts         = &raw_create_opts,
2512 
2513 
2514     .bdrv_co_preadv         = raw_co_preadv,
2515     .bdrv_co_pwritev        = raw_co_pwritev,
2516     .bdrv_aio_flush	= raw_aio_flush,
2517     .bdrv_refresh_limits = raw_refresh_limits,
2518     .bdrv_io_plug = raw_aio_plug,
2519     .bdrv_io_unplug = raw_aio_unplug,
2520 
2521     .bdrv_truncate      = raw_truncate,
2522     .bdrv_getlength      = raw_getlength,
2523     .has_variable_length = true,
2524     .bdrv_get_allocated_file_size
2525                         = raw_get_allocated_file_size,
2526 
2527     /* removable device support */
2528     .bdrv_is_inserted   = cdrom_is_inserted,
2529     .bdrv_eject         = cdrom_eject,
2530     .bdrv_lock_medium   = cdrom_lock_medium,
2531 
2532     /* generic scsi device */
2533     .bdrv_aio_ioctl     = hdev_aio_ioctl,
2534 };
2535 #endif /* __linux__ */
2536 
2537 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2538 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
2539                       Error **errp)
2540 {
2541     BDRVRawState *s = bs->opaque;
2542     Error *local_err = NULL;
2543     int ret;
2544 
2545     s->type = FTYPE_CD;
2546 
2547     ret = raw_open_common(bs, options, flags, 0, &local_err);
2548     if (ret) {
2549         error_propagate(errp, local_err);
2550         return ret;
2551     }
2552 
2553     /* make sure the door isn't locked at this time */
2554     ioctl(s->fd, CDIOCALLOW);
2555     return 0;
2556 }
2557 
2558 static int cdrom_probe_device(const char *filename)
2559 {
2560     if (strstart(filename, "/dev/cd", NULL) ||
2561             strstart(filename, "/dev/acd", NULL))
2562         return 100;
2563     return 0;
2564 }
2565 
2566 static int cdrom_reopen(BlockDriverState *bs)
2567 {
2568     BDRVRawState *s = bs->opaque;
2569     int fd;
2570 
2571     /*
2572      * Force reread of possibly changed/newly loaded disc,
2573      * FreeBSD seems to not notice sometimes...
2574      */
2575     if (s->fd >= 0)
2576         qemu_close(s->fd);
2577     fd = qemu_open(bs->filename, s->open_flags, 0644);
2578     if (fd < 0) {
2579         s->fd = -1;
2580         return -EIO;
2581     }
2582     s->fd = fd;
2583 
2584     /* make sure the door isn't locked at this time */
2585     ioctl(s->fd, CDIOCALLOW);
2586     return 0;
2587 }
2588 
2589 static bool cdrom_is_inserted(BlockDriverState *bs)
2590 {
2591     return raw_getlength(bs) > 0;
2592 }
2593 
2594 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
2595 {
2596     BDRVRawState *s = bs->opaque;
2597 
2598     if (s->fd < 0)
2599         return;
2600 
2601     (void) ioctl(s->fd, CDIOCALLOW);
2602 
2603     if (eject_flag) {
2604         if (ioctl(s->fd, CDIOCEJECT) < 0)
2605             perror("CDIOCEJECT");
2606     } else {
2607         if (ioctl(s->fd, CDIOCCLOSE) < 0)
2608             perror("CDIOCCLOSE");
2609     }
2610 
2611     cdrom_reopen(bs);
2612 }
2613 
2614 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
2615 {
2616     BDRVRawState *s = bs->opaque;
2617 
2618     if (s->fd < 0)
2619         return;
2620     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
2621         /*
2622          * Note: an error can happen if the distribution automatically
2623          * mounts the CD-ROM
2624          */
2625         /* perror("CDROM_LOCKDOOR"); */
2626     }
2627 }
2628 
2629 static BlockDriver bdrv_host_cdrom = {
2630     .format_name        = "host_cdrom",
2631     .protocol_name      = "host_cdrom",
2632     .instance_size      = sizeof(BDRVRawState),
2633     .bdrv_needs_filename = true,
2634     .bdrv_probe_device	= cdrom_probe_device,
2635     .bdrv_parse_filename = cdrom_parse_filename,
2636     .bdrv_file_open     = cdrom_open,
2637     .bdrv_close         = raw_close,
2638     .bdrv_reopen_prepare = raw_reopen_prepare,
2639     .bdrv_reopen_commit  = raw_reopen_commit,
2640     .bdrv_reopen_abort   = raw_reopen_abort,
2641     .bdrv_create        = hdev_create,
2642     .create_opts        = &raw_create_opts,
2643 
2644     .bdrv_co_preadv         = raw_co_preadv,
2645     .bdrv_co_pwritev        = raw_co_pwritev,
2646     .bdrv_aio_flush	= raw_aio_flush,
2647     .bdrv_refresh_limits = raw_refresh_limits,
2648     .bdrv_io_plug = raw_aio_plug,
2649     .bdrv_io_unplug = raw_aio_unplug,
2650 
2651     .bdrv_truncate      = raw_truncate,
2652     .bdrv_getlength      = raw_getlength,
2653     .has_variable_length = true,
2654     .bdrv_get_allocated_file_size
2655                         = raw_get_allocated_file_size,
2656 
2657     /* removable device support */
2658     .bdrv_is_inserted   = cdrom_is_inserted,
2659     .bdrv_eject         = cdrom_eject,
2660     .bdrv_lock_medium   = cdrom_lock_medium,
2661 };
2662 #endif /* __FreeBSD__ */
2663 
2664 static void bdrv_file_init(void)
2665 {
2666     /*
2667      * Register all the drivers.  Note that order is important, the driver
2668      * registered last will get probed first.
2669      */
2670     bdrv_register(&bdrv_file);
2671     bdrv_register(&bdrv_host_device);
2672 #ifdef __linux__
2673     bdrv_register(&bdrv_host_cdrom);
2674 #endif
2675 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2676     bdrv_register(&bdrv_host_cdrom);
2677 #endif
2678 }
2679 
2680 block_init(bdrv_file_init);
2681