xref: /openbmc/qemu/block/file-posix.c (revision 7609ffb9)
1 /*
2  * Block driver for RAW files (posix)
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "qemu/osdep.h"
25 #include "qapi/error.h"
26 #include "qemu/cutils.h"
27 #include "qemu/error-report.h"
28 #include "qemu/timer.h"
29 #include "qemu/log.h"
30 #include "block/block_int.h"
31 #include "qemu/module.h"
32 #include "trace.h"
33 #include "block/thread-pool.h"
34 #include "qemu/iov.h"
35 #include "block/raw-aio.h"
36 #include "qapi/util.h"
37 #include "qapi/qmp/qstring.h"
38 
39 #if defined(__APPLE__) && (__MACH__)
40 #include <paths.h>
41 #include <sys/param.h>
42 #include <IOKit/IOKitLib.h>
43 #include <IOKit/IOBSD.h>
44 #include <IOKit/storage/IOMediaBSDClient.h>
45 #include <IOKit/storage/IOMedia.h>
46 #include <IOKit/storage/IOCDMedia.h>
47 //#include <IOKit/storage/IOCDTypes.h>
48 #include <IOKit/storage/IODVDMedia.h>
49 #include <CoreFoundation/CoreFoundation.h>
50 #endif
51 
52 #ifdef __sun__
53 #define _POSIX_PTHREAD_SEMANTICS 1
54 #include <sys/dkio.h>
55 #endif
56 #ifdef __linux__
57 #include <sys/ioctl.h>
58 #include <sys/param.h>
59 #include <linux/cdrom.h>
60 #include <linux/fd.h>
61 #include <linux/fs.h>
62 #include <linux/hdreg.h>
63 #include <scsi/sg.h>
64 #ifdef __s390__
65 #include <asm/dasd.h>
66 #endif
67 #ifndef FS_NOCOW_FL
68 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
69 #endif
70 #endif
71 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
72 #include <linux/falloc.h>
73 #endif
74 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
75 #include <sys/disk.h>
76 #include <sys/cdio.h>
77 #endif
78 
79 #ifdef __OpenBSD__
80 #include <sys/ioctl.h>
81 #include <sys/disklabel.h>
82 #include <sys/dkio.h>
83 #endif
84 
85 #ifdef __NetBSD__
86 #include <sys/ioctl.h>
87 #include <sys/disklabel.h>
88 #include <sys/dkio.h>
89 #include <sys/disk.h>
90 #endif
91 
92 #ifdef __DragonFly__
93 #include <sys/ioctl.h>
94 #include <sys/diskslice.h>
95 #endif
96 
97 #ifdef CONFIG_XFS
98 #include <xfs/xfs.h>
99 #endif
100 
/* Uncomment the following line to enable the DPRINTF() debug output. */
//#define DEBUG_BLOCK

#ifdef DEBUG_BLOCK
# define DEBUG_BLOCK_PRINT 1
#else
# define DEBUG_BLOCK_PRINT 0
#endif
/*
 * printf()-style debug output.  The call is always compiled (so the format
 * string and arguments are still checked by the compiler) but only executes
 * when DEBUG_BLOCK_PRINT is non-zero.
 */
#define DPRINTF(fmt, ...) \
do { \
    if (DEBUG_BLOCK_PRINT) { \
        printf(fmt, ## __VA_ARGS__); \
    } \
} while (0)
114 
/*
 * OS X does not have O_DSYNC: fall back to O_SYNC, or to O_FSYNC if that is
 * the only one available.  If neither exists, O_DSYNC stays undefined.
 */
#ifndef O_DSYNC
#ifdef O_SYNC
#define O_DSYNC O_SYNC
#elif defined(O_FSYNC)
#define O_DSYNC O_FSYNC
#endif
#endif

/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
#ifndef O_DIRECT
#define O_DIRECT O_DSYNC
#endif
128 
/* Values for BDRVRawState.type */
#define FTYPE_FILE   0
#define FTYPE_CD     1

/* Lower bound for the largest alignment tried by raw_probe_alignment() */
#define MAX_BLOCKSIZE	4096
133 
/* Per-BlockDriverState driver state, reached through bs->opaque. */
typedef struct BDRVRawState {
    int fd;             /* descriptor of the opened file; -1 if open failed */
    int type;           /* one of the FTYPE_* constants */
    int open_flags;     /* flags the descriptor was opened with */
    size_t buf_align;   /* memory alignment found by raw_probe_alignment() */

#ifdef CONFIG_XFS
    bool is_xfs:1;      /* platform_test_xfs_fd() succeeded on fd */
#endif
    bool has_discard:1;         /* set to true by raw_open_common() */
    bool has_write_zeroes:1;    /* cleared for buffered Linux block devices */
    bool discard_zeroes:1;      /* discarded ranges are expected to read 0 */
    bool use_linux_aio:1;       /* aio=native was selected */
    bool page_cache_inconsistent:1; /* a buffered fdatasync() has failed;
                                     * later flushes return -EIO */
    bool has_fallocate;         /* set for regular files */
    bool needs_alignment;       /* requests must honour the probed alignment */
} BDRVRawState;
151 
/*
 * State built by raw_reopen_prepare() and stored in BDRVReopenState.opaque
 * until raw_reopen_commit() or raw_reopen_abort() consumes it.
 */
typedef struct BDRVRawReopenState {
    int fd;             /* descriptor for the new flags, or -1 */
    int open_flags;     /* open flags requested by the reopen */
} BDRVRawReopenState;
156 
/* Forward declarations of helpers that are not defined in this part of the file */
static int fd_open(BlockDriverState *bs);
static int64_t raw_getlength(BlockDriverState *bs);
159 
/* Description of one request processed by the handle_aiocb_*() helpers. */
typedef struct RawPosixAIOData {
    BlockDriverState *bs;       /* node the request belongs to */
    int aio_fildes;             /* file descriptor to operate on */
    union {
        struct iovec *aio_iov;  /* scatter/gather list for reads and writes */
        void *aio_ioctl_buf;    /* argument buffer for ioctl requests */
    };
    int aio_niov;               /* number of elements in aio_iov */
    uint64_t aio_nbytes;        /* total length of the request in bytes */
#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
    off_t aio_offset;           /* file offset of the request */
    int aio_type;               /* QEMU_AIO_* flags */
} RawPosixAIOData;
173 
/* Forward declaration, only needed on FreeBSD (and FreeBSD-kernel) hosts */
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
static int cdrom_reopen(BlockDriverState *bs);
#endif
177 
#if defined(__NetBSD__)
/*
 * If *@filename names a block device, replace it with the path obtained by
 * prefixing the last path component with 'r' (e.g. "/dev/sd0a" becomes
 * "/dev/rsd0a"); other files are left untouched.
 *
 * The rewritten path lives in a static buffer, so it is overwritten by the
 * next call and the function is not reentrant.
 *
 * Returns 0 on success, -errno if the file cannot be examined.
 */
static int raw_normalize_devicepath(const char **filename)
{
    static char namebuf[PATH_MAX];
    const char *dp, *fname;
    struct stat sb;

    fname = *filename;
    dp = strrchr(fname, '/');
    if (lstat(fname, &sb) < 0) {
        /* fprintf() may overwrite errno, so save it before reporting */
        int saved_errno = errno;

        fprintf(stderr, "%s: stat failed: %s\n",
            fname, strerror(saved_errno));
        return -saved_errno;
    }

    if (!S_ISBLK(sb.st_mode)) {
        return 0;
    }

    if (dp == NULL) {
        /* No directory part: just prepend 'r' to the name */
        snprintf(namebuf, PATH_MAX, "r%s", fname);
    } else {
        /* Keep the directory part, insert 'r' before the last component */
        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
            (int)(dp - fname), fname, dp + 1);
    }
    fprintf(stderr, "%s is a block device", fname);
    *filename = namebuf;
    fprintf(stderr, ", using %s\n", *filename);

    return 0;
}
#else
/* No device path normalization on this platform: always succeeds. */
static int raw_normalize_devicepath(const char **filename)
{
    return 0;
}
#endif
215 
/*
 * Ask the kernel for the logical block size of @fd.
 *
 * Every ioctl known at build time is tried in turn; each one that succeeds
 * stores its result in @sector_size_p.  Returns 0 if at least one ioctl
 * succeeded, otherwise -errno (-ENOTSUP when no ioctl is available).
 */
static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
{
    static const unsigned long ioctl_list[] = {
#ifdef BLKSSZGET
        BLKSSZGET,
#endif
#ifdef DKIOCGETBLOCKSIZE
        DKIOCGETBLOCKSIZE,
#endif
#ifdef DIOCGSECTORSIZE
        DIOCGSECTORSIZE,
#endif
    };
    unsigned int value;
    bool found = false;
    int idx;

    /* Reported if the list is empty and no ioctl ever sets errno */
    errno = ENOTSUP;

    for (idx = 0; idx < (int)ARRAY_SIZE(ioctl_list); idx++) {
        if (ioctl(fd, ioctl_list[idx], &value) < 0) {
            continue;
        }
        *sector_size_p = value;
        found = true;
    }

    if (!found) {
        return -errno;
    }
    return 0;
}
248 
/**
 * Query the physical block size of @fd.
 * Returns 0 and fills in @blk_size on success, -errno if the ioctl fails,
 * or -ENOTSUP when BLKPBSZGET is not available at build time.
 */
static int probe_physical_blocksize(int fd, unsigned int *blk_size)
{
#ifndef BLKPBSZGET
    return -ENOTSUP;
#else
    int rc = ioctl(fd, BLKPBSZGET, blk_size);

    return rc < 0 ? -errno : 0;
#endif
}
265 
/*
 * Probe whether a read of @len bytes at offset 0 into @buf is accepted.
 *
 * Used to discover the buffer and request alignment required by O_DIRECT.
 * Returns true if the combination looks usable, false otherwise.
 */
static bool raw_is_io_aligned(int fd, void *buf, size_t len)
{
    if (pread(fd, buf, len, 0) >= 0) {
        return true;
    }

#ifdef __linux__
    /* Linux reports misaligned O_DIRECT reads with EINVAL.  Any other error
     * (for example a real I/O error on a failing drive) says nothing about
     * alignment, so it is treated as "aligned".
     */
    return errno != EINVAL;
#else
    return false;
#endif
}
290 
291 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
292 {
293     BDRVRawState *s = bs->opaque;
294     char *buf;
295     size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
296 
297     /* For SCSI generic devices the alignment is not really used.
298        With buffered I/O, we don't have any restrictions. */
299     if (bdrv_is_sg(bs) || !s->needs_alignment) {
300         bs->bl.request_alignment = 1;
301         s->buf_align = 1;
302         return;
303     }
304 
305     bs->bl.request_alignment = 0;
306     s->buf_align = 0;
307     /* Let's try to use the logical blocksize for the alignment. */
308     if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
309         bs->bl.request_alignment = 0;
310     }
311 #ifdef CONFIG_XFS
312     if (s->is_xfs) {
313         struct dioattr da;
314         if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
315             bs->bl.request_alignment = da.d_miniosz;
316             /* The kernel returns wrong information for d_mem */
317             /* s->buf_align = da.d_mem; */
318         }
319     }
320 #endif
321 
322     /* If we could not get the sizes so far, we can only guess them */
323     if (!s->buf_align) {
324         size_t align;
325         buf = qemu_memalign(max_align, 2 * max_align);
326         for (align = 512; align <= max_align; align <<= 1) {
327             if (raw_is_io_aligned(fd, buf + align, max_align)) {
328                 s->buf_align = align;
329                 break;
330             }
331         }
332         qemu_vfree(buf);
333     }
334 
335     if (!bs->bl.request_alignment) {
336         size_t align;
337         buf = qemu_memalign(s->buf_align, max_align);
338         for (align = 512; align <= max_align; align <<= 1) {
339             if (raw_is_io_aligned(fd, buf, align)) {
340                 bs->bl.request_alignment = align;
341                 break;
342             }
343         }
344         qemu_vfree(buf);
345     }
346 
347     if (!s->buf_align || !bs->bl.request_alignment) {
348         error_setg(errp, "Could not find working O_DIRECT alignment");
349         error_append_hint(errp, "Try cache.direct=off\n");
350     }
351 }
352 
353 static void raw_parse_flags(int bdrv_flags, int *open_flags)
354 {
355     assert(open_flags != NULL);
356 
357     *open_flags |= O_BINARY;
358     *open_flags &= ~O_ACCMODE;
359     if (bdrv_flags & BDRV_O_RDWR) {
360         *open_flags |= O_RDWR;
361     } else {
362         *open_flags |= O_RDONLY;
363     }
364 
365     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
366      * and O_DIRECT for no caching. */
367     if ((bdrv_flags & BDRV_O_NOCACHE)) {
368         *open_flags |= O_DIRECT;
369     }
370 }
371 
372 static void raw_parse_filename(const char *filename, QDict *options,
373                                Error **errp)
374 {
375     /* The filename does not have to be prefixed by the protocol name, since
376      * "file" is the default protocol; therefore, the return value of this
377      * function call can be ignored. */
378     strstart(filename, "file:", &filename);
379 
380     qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
381 }
382 
/*
 * Runtime options understood by raw_open_common(): the image file name and
 * the host AIO implementation to use.
 */
static QemuOptsList raw_runtime_opts = {
    .name = "raw",
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
        {
            /* parsed as a BlockdevAioOptions value by raw_open_common() */
            .name = "aio",
            .type = QEMU_OPT_STRING,
            .help = "host AIO implementation (threads, native)",
        },
        { /* end of list */ }
    },
};
400 
401 static int raw_open_common(BlockDriverState *bs, QDict *options,
402                            int bdrv_flags, int open_flags, Error **errp)
403 {
404     BDRVRawState *s = bs->opaque;
405     QemuOpts *opts;
406     Error *local_err = NULL;
407     const char *filename = NULL;
408     BlockdevAioOptions aio, aio_default;
409     int fd, ret;
410     struct stat st;
411 
412     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
413     qemu_opts_absorb_qdict(opts, options, &local_err);
414     if (local_err) {
415         error_propagate(errp, local_err);
416         ret = -EINVAL;
417         goto fail;
418     }
419 
420     filename = qemu_opt_get(opts, "filename");
421 
422     ret = raw_normalize_devicepath(&filename);
423     if (ret != 0) {
424         error_setg_errno(errp, -ret, "Could not normalize device path");
425         goto fail;
426     }
427 
428     aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
429                   ? BLOCKDEV_AIO_OPTIONS_NATIVE
430                   : BLOCKDEV_AIO_OPTIONS_THREADS;
431     aio = qapi_enum_parse(BlockdevAioOptions_lookup, qemu_opt_get(opts, "aio"),
432                           BLOCKDEV_AIO_OPTIONS__MAX, aio_default, &local_err);
433     if (local_err) {
434         error_propagate(errp, local_err);
435         ret = -EINVAL;
436         goto fail;
437     }
438     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
439 
440     s->open_flags = open_flags;
441     raw_parse_flags(bdrv_flags, &s->open_flags);
442 
443     s->fd = -1;
444     fd = qemu_open(filename, s->open_flags, 0644);
445     if (fd < 0) {
446         ret = -errno;
447         error_setg_errno(errp, errno, "Could not open '%s'", filename);
448         if (ret == -EROFS) {
449             ret = -EACCES;
450         }
451         goto fail;
452     }
453     s->fd = fd;
454 
455 #ifdef CONFIG_LINUX_AIO
456      /* Currently Linux does AIO only for files opened with O_DIRECT */
457     if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
458         error_setg(errp, "aio=native was specified, but it requires "
459                          "cache.direct=on, which was not specified.");
460         ret = -EINVAL;
461         goto fail;
462     }
463 #else
464     if (s->use_linux_aio) {
465         error_setg(errp, "aio=native was specified, but is not supported "
466                          "in this build.");
467         ret = -EINVAL;
468         goto fail;
469     }
470 #endif /* !defined(CONFIG_LINUX_AIO) */
471 
472     s->has_discard = true;
473     s->has_write_zeroes = true;
474     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
475     if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
476         s->needs_alignment = true;
477     }
478 
479     if (fstat(s->fd, &st) < 0) {
480         ret = -errno;
481         error_setg_errno(errp, errno, "Could not stat file");
482         goto fail;
483     }
484     if (S_ISREG(st.st_mode)) {
485         s->discard_zeroes = true;
486         s->has_fallocate = true;
487     }
488     if (S_ISBLK(st.st_mode)) {
489 #ifdef BLKDISCARDZEROES
490         unsigned int arg;
491         if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
492             s->discard_zeroes = true;
493         }
494 #endif
495 #ifdef __linux__
496         /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
497          * not rely on the contents of discarded blocks unless using O_DIRECT.
498          * Same for BLKZEROOUT.
499          */
500         if (!(bs->open_flags & BDRV_O_NOCACHE)) {
501             s->discard_zeroes = false;
502             s->has_write_zeroes = false;
503         }
504 #endif
505     }
506 #ifdef __FreeBSD__
507     if (S_ISCHR(st.st_mode)) {
508         /*
509          * The file is a char device (disk), which on FreeBSD isn't behind
510          * a pager, so force all requests to be aligned. This is needed
511          * so QEMU makes sure all IO operations on the device are aligned
512          * to sector size, or else FreeBSD will reject them with EINVAL.
513          */
514         s->needs_alignment = true;
515     }
516 #endif
517 
518 #ifdef CONFIG_XFS
519     if (platform_test_xfs_fd(s->fd)) {
520         s->is_xfs = true;
521     }
522 #endif
523 
524     ret = 0;
525 fail:
526     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
527         unlink(filename);
528     }
529     qemu_opts_del(opts);
530     return ret;
531 }
532 
533 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
534                     Error **errp)
535 {
536     BDRVRawState *s = bs->opaque;
537 
538     s->type = FTYPE_FILE;
539     return raw_open_common(bs, options, flags, 0, errp);
540 }
541 
542 static int raw_reopen_prepare(BDRVReopenState *state,
543                               BlockReopenQueue *queue, Error **errp)
544 {
545     BDRVRawState *s;
546     BDRVRawReopenState *rs;
547     int ret = 0;
548     Error *local_err = NULL;
549 
550     assert(state != NULL);
551     assert(state->bs != NULL);
552 
553     s = state->bs->opaque;
554 
555     state->opaque = g_new0(BDRVRawReopenState, 1);
556     rs = state->opaque;
557 
558     if (s->type == FTYPE_CD) {
559         rs->open_flags |= O_NONBLOCK;
560     }
561 
562     raw_parse_flags(state->flags, &rs->open_flags);
563 
564     rs->fd = -1;
565 
566     int fcntl_flags = O_APPEND | O_NONBLOCK;
567 #ifdef O_NOATIME
568     fcntl_flags |= O_NOATIME;
569 #endif
570 
571 #ifdef O_ASYNC
572     /* Not all operating systems have O_ASYNC, and those that don't
573      * will not let us track the state into rs->open_flags (typically
574      * you achieve the same effect with an ioctl, for example I_SETSIG
575      * on Solaris). But we do not use O_ASYNC, so that's fine.
576      */
577     assert((s->open_flags & O_ASYNC) == 0);
578 #endif
579 
580     if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
581         /* dup the original fd */
582         rs->fd = qemu_dup(s->fd);
583         if (rs->fd >= 0) {
584             ret = fcntl_setfl(rs->fd, rs->open_flags);
585             if (ret) {
586                 qemu_close(rs->fd);
587                 rs->fd = -1;
588             }
589         }
590     }
591 
592     /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
593     if (rs->fd == -1) {
594         const char *normalized_filename = state->bs->filename;
595         ret = raw_normalize_devicepath(&normalized_filename);
596         if (ret < 0) {
597             error_setg_errno(errp, -ret, "Could not normalize device path");
598         } else {
599             assert(!(rs->open_flags & O_CREAT));
600             rs->fd = qemu_open(normalized_filename, rs->open_flags);
601             if (rs->fd == -1) {
602                 error_setg_errno(errp, errno, "Could not reopen file");
603                 ret = -1;
604             }
605         }
606     }
607 
608     /* Fail already reopen_prepare() if we can't get a working O_DIRECT
609      * alignment with the new fd. */
610     if (rs->fd != -1) {
611         raw_probe_alignment(state->bs, rs->fd, &local_err);
612         if (local_err) {
613             qemu_close(rs->fd);
614             rs->fd = -1;
615             error_propagate(errp, local_err);
616             ret = -EINVAL;
617         }
618     }
619 
620     return ret;
621 }
622 
623 static void raw_reopen_commit(BDRVReopenState *state)
624 {
625     BDRVRawReopenState *rs = state->opaque;
626     BDRVRawState *s = state->bs->opaque;
627 
628     s->open_flags = rs->open_flags;
629 
630     qemu_close(s->fd);
631     s->fd = rs->fd;
632 
633     g_free(state->opaque);
634     state->opaque = NULL;
635 }
636 
637 
638 static void raw_reopen_abort(BDRVReopenState *state)
639 {
640     BDRVRawReopenState *rs = state->opaque;
641 
642      /* nothing to do if NULL, we didn't get far enough */
643     if (rs == NULL) {
644         return;
645     }
646 
647     if (rs->fd >= 0) {
648         qemu_close(rs->fd);
649         rs->fd = -1;
650     }
651     g_free(state->opaque);
652     state->opaque = NULL;
653 }
654 
655 static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
656 {
657 #ifdef BLKSECTGET
658     int max_bytes = 0;
659     short max_sectors = 0;
660     if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
661         return max_bytes;
662     } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
663         return max_sectors << BDRV_SECTOR_BITS;
664     } else {
665         return -errno;
666     }
667 #else
668     return -ENOSYS;
669 #endif
670 }
671 
/*
 * Read the "queue/max_segments" sysfs attribute of the block device
 * identified by st->st_rdev.
 *
 * Returns the parsed value, a negative errno value if the attribute cannot
 * be opened, read or parsed, or -ENOTSUP on builds without CONFIG_LINUX.
 */
static int hdev_get_max_segments(const struct stat *st)
{
#ifdef CONFIG_LINUX
    char text[32];
    const char *tail;
    char *path;
    long value;
    int sysfd;
    int ret;

    path = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                           major(st->st_rdev), minor(st->st_rdev));
    sysfd = open(path, O_RDONLY);
    if (sysfd == -1) {
        ret = -errno;
        g_free(path);
        return ret;
    }

    /* Leave room for the terminating NUL; retry when interrupted */
    do {
        ret = read(sysfd, text, sizeof(text) - 1);
    } while (ret == -1 && errno == EINTR);

    if (ret < 0) {
        ret = -errno;
    } else if (ret == 0) {
        ret = -EIO;
    } else {
        text[ret] = 0;
        /* The attribute ends with '\n'; 'tail' lets us accept exactly that */
        ret = qemu_strtol(text, &tail, 10, &value);
        if (ret == 0 && tail && *tail == '\n') {
            ret = value;
        }
    }

    close(sysfd);
    g_free(path);
    return ret;
#else
    return -ENOTSUP;
#endif
}
716 
717 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
718 {
719     BDRVRawState *s = bs->opaque;
720     struct stat st;
721 
722     if (!fstat(s->fd, &st)) {
723         if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
724             int ret = hdev_get_max_transfer_length(bs, s->fd);
725             if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
726                 bs->bl.max_transfer = pow2floor(ret);
727             }
728             ret = hdev_get_max_segments(&st);
729             if (ret > 0) {
730                 bs->bl.max_transfer = MIN(bs->bl.max_transfer,
731                                           ret * getpagesize());
732             }
733         }
734     }
735 
736     raw_probe_alignment(bs, s->fd, errp);
737     bs->bl.min_mem_alignment = s->buf_align;
738     bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
739 }
740 
/*
 * Test whether @fd answers the BIODASDINFO2 ioctl.
 * Returns the ioctl result, or -1 when BIODASDINFO2 is not available at
 * build time.
 */
static int check_for_dasd(int fd)
{
#ifndef BIODASDINFO2
    return -1;
#else
    struct dasd_information2_t info = {0};

    return ioctl(fd, BIODASDINFO2, &info);
#endif
}
751 
752 /**
753  * Try to get @bs's logical and physical block size.
754  * On success, store them in @bsz and return zero.
755  * On failure, return negative errno.
756  */
757 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
758 {
759     BDRVRawState *s = bs->opaque;
760     int ret;
761 
762     /* If DASD, get blocksizes */
763     if (check_for_dasd(s->fd) < 0) {
764         return -ENOTSUP;
765     }
766     ret = probe_logical_blocksize(s->fd, &bsz->log);
767     if (ret < 0) {
768         return ret;
769     }
770     return probe_physical_blocksize(s->fd, &bsz->phys);
771 }
772 
773 /**
774  * Try to get @bs's geometry: cyls, heads, sectors.
775  * On success, store them in @geo and return 0.
776  * On failure return -errno.
777  * (Allows block driver to assign default geometry values that guest sees)
778  */
779 #ifdef __linux__
780 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
781 {
782     BDRVRawState *s = bs->opaque;
783     struct hd_geometry ioctl_geo = {0};
784 
785     /* If DASD, get its geometry */
786     if (check_for_dasd(s->fd) < 0) {
787         return -ENOTSUP;
788     }
789     if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
790         return -errno;
791     }
792     /* HDIO_GETGEO may return success even though geo contains zeros
793        (e.g. certain multipath setups) */
794     if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
795         return -ENOTSUP;
796     }
797     /* Do not return a geometry for partition */
798     if (ioctl_geo.start != 0) {
799         return -ENOTSUP;
800     }
801     geo->heads = ioctl_geo.heads;
802     geo->sectors = ioctl_geo.sectors;
803     geo->cylinders = ioctl_geo.cylinders;
804 
805     return 0;
806 }
807 #else /* __linux__ */
808 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
809 {
810     return -ENOTSUP;
811 }
812 #endif
813 
814 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
815 {
816     int ret;
817 
818     ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
819     if (ret == -1) {
820         return -errno;
821     }
822 
823     return 0;
824 }
825 
826 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
827 {
828     BDRVRawState *s = aiocb->bs->opaque;
829     int ret;
830 
831     if (s->page_cache_inconsistent) {
832         return -EIO;
833     }
834 
835     ret = qemu_fdatasync(aiocb->aio_fildes);
836     if (ret == -1) {
837         /* There is no clear definition of the semantics of a failing fsync(),
838          * so we may have to assume the worst. The sad truth is that this
839          * assumption is correct for Linux. Some pages are now probably marked
840          * clean in the page cache even though they are inconsistent with the
841          * on-disk contents. The next fdatasync() call would succeed, but no
842          * further writeback attempt will be made. We can't get back to a state
843          * in which we know what is on disk (we would have to rewrite
844          * everything that was touched since the last fdatasync() at least), so
845          * make bdrv_flush() fail permanently. Given that the behaviour isn't
846          * really defined, I have little hope that other OSes are doing better.
847          *
848          * Obviously, this doesn't affect O_DIRECT, which bypasses the page
849          * cache. */
850         if ((s->open_flags & O_DIRECT) == 0) {
851             s->page_cache_inconsistent = true;
852         }
853         return -errno;
854     }
855     return 0;
856 }
857 
/*
 * Whether vectored positional I/O may be attempted.  Starts out true only
 * when the build has CONFIG_PREADV.
 */
#ifdef CONFIG_PREADV
static bool preadv_present = true;
#else
static bool preadv_present = false;
#endif

/*
 * Thin wrapper around preadv(); returns -ENOSYS on builds without
 * CONFIG_PREADV.
 */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return preadv(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

/*
 * Thin wrapper around pwritev(); returns -ENOSYS on builds without
 * CONFIG_PREADV.
 */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return pwritev(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}
891 
892 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
893 {
894     ssize_t len;
895 
896     do {
897         if (aiocb->aio_type & QEMU_AIO_WRITE)
898             len = qemu_pwritev(aiocb->aio_fildes,
899                                aiocb->aio_iov,
900                                aiocb->aio_niov,
901                                aiocb->aio_offset);
902          else
903             len = qemu_preadv(aiocb->aio_fildes,
904                               aiocb->aio_iov,
905                               aiocb->aio_niov,
906                               aiocb->aio_offset);
907     } while (len == -1 && errno == EINTR);
908 
909     if (len == -1) {
910         return -errno;
911     }
912     return len;
913 }
914 
915 /*
916  * Read/writes the data to/from a given linear buffer.
917  *
918  * Returns the number of bytes handles or -errno in case of an error. Short
919  * reads are only returned if the end of the file is reached.
920  */
921 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
922 {
923     ssize_t offset = 0;
924     ssize_t len;
925 
926     while (offset < aiocb->aio_nbytes) {
927         if (aiocb->aio_type & QEMU_AIO_WRITE) {
928             len = pwrite(aiocb->aio_fildes,
929                          (const char *)buf + offset,
930                          aiocb->aio_nbytes - offset,
931                          aiocb->aio_offset + offset);
932         } else {
933             len = pread(aiocb->aio_fildes,
934                         buf + offset,
935                         aiocb->aio_nbytes - offset,
936                         aiocb->aio_offset + offset);
937         }
938         if (len == -1 && errno == EINTR) {
939             continue;
940         } else if (len == -1 && errno == EINVAL &&
941                    (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
942                    !(aiocb->aio_type & QEMU_AIO_WRITE) &&
943                    offset > 0) {
944             /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
945              * after a short read.  Assume that O_DIRECT short reads only occur
946              * at EOF.  Therefore this is a short read, not an I/O error.
947              */
948             break;
949         } else if (len == -1) {
950             offset = -errno;
951             break;
952         } else if (len == 0) {
953             break;
954         }
955         offset += len;
956     }
957 
958     return offset;
959 }
960 
961 static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
962 {
963     ssize_t nbytes;
964     char *buf;
965 
966     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
967         /*
968          * If there is just a single buffer, and it is properly aligned
969          * we can just use plain pread/pwrite without any problems.
970          */
971         if (aiocb->aio_niov == 1) {
972              return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
973         }
974         /*
975          * We have more than one iovec, and all are properly aligned.
976          *
977          * Try preadv/pwritev first and fall back to linearizing the
978          * buffer if it's not supported.
979          */
980         if (preadv_present) {
981             nbytes = handle_aiocb_rw_vector(aiocb);
982             if (nbytes == aiocb->aio_nbytes ||
983                 (nbytes < 0 && nbytes != -ENOSYS)) {
984                 return nbytes;
985             }
986             preadv_present = false;
987         }
988 
989         /*
990          * XXX(hch): short read/write.  no easy way to handle the reminder
991          * using these interfaces.  For now retry using plain
992          * pread/pwrite?
993          */
994     }
995 
996     /*
997      * Ok, we have to do it the hard way, copy all segments into
998      * a single aligned buffer.
999      */
1000     buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1001     if (buf == NULL) {
1002         return -ENOMEM;
1003     }
1004 
1005     if (aiocb->aio_type & QEMU_AIO_WRITE) {
1006         char *p = buf;
1007         int i;
1008 
1009         for (i = 0; i < aiocb->aio_niov; ++i) {
1010             memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
1011             p += aiocb->aio_iov[i].iov_len;
1012         }
1013         assert(p - buf == aiocb->aio_nbytes);
1014     }
1015 
1016     nbytes = handle_aiocb_rw_linear(aiocb, buf);
1017     if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1018         char *p = buf;
1019         size_t count = aiocb->aio_nbytes, copy;
1020         int i;
1021 
1022         for (i = 0; i < aiocb->aio_niov && count; ++i) {
1023             copy = count;
1024             if (copy > aiocb->aio_iov[i].iov_len) {
1025                 copy = aiocb->aio_iov[i].iov_len;
1026             }
1027             memcpy(aiocb->aio_iov[i].iov_base, p, copy);
1028             assert(count >= copy);
1029             p     += copy;
1030             count -= copy;
1031         }
1032         assert(count == 0);
1033     }
1034     qemu_vfree(buf);
1035 
1036     return nbytes;
1037 }
1038 
1039 #ifdef CONFIG_XFS
1040 static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
1041 {
1042     struct xfs_flock64 fl;
1043     int err;
1044 
1045     memset(&fl, 0, sizeof(fl));
1046     fl.l_whence = SEEK_SET;
1047     fl.l_start = offset;
1048     fl.l_len = bytes;
1049 
1050     if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
1051         err = errno;
1052         DPRINTF("cannot write zero range (%s)\n", strerror(errno));
1053         return -err;
1054     }
1055 
1056     return 0;
1057 }
1058 
1059 static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
1060 {
1061     struct xfs_flock64 fl;
1062     int err;
1063 
1064     memset(&fl, 0, sizeof(fl));
1065     fl.l_whence = SEEK_SET;
1066     fl.l_start = offset;
1067     fl.l_len = bytes;
1068 
1069     if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
1070         err = errno;
1071         DPRINTF("cannot punch hole (%s)\n", strerror(errno));
1072         return -err;
1073     }
1074 
1075     return 0;
1076 }
1077 #endif
1078 
/*
 * Fold the various "operation not available" error codes into -ENOTSUP.
 *
 * @err: a negated errno value
 *
 * Returns -ENOTSUP when @err is -ENODEV, -ENOSYS, -EOPNOTSUPP or -ENOTTY;
 * any other value is returned unchanged.
 */
static int translate_err(int err)
{
    const bool unsupported = err == -ENODEV ||
                             err == -ENOSYS ||
                             err == -EOPNOTSUPP ||
                             err == -ENOTTY;

    return unsupported ? -ENOTSUP : err;
}
1087 
1088 #ifdef CONFIG_FALLOCATE
/*
 * Call fallocate(), restarting it for as long as it fails with EINTR.
 *
 * Returns 0 on success; otherwise the negated errno of the failing call,
 * passed through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    int rc;

    while ((rc = fallocate(fd, mode, offset, len)) != 0 && errno == EINTR) {
        /* interrupted: issue the same request again */
        continue;
    }

    return rc == 0 ? 0 : translate_err(-errno);
}
1098 #endif
1099 
1100 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1101 {
1102     int ret = -ENOTSUP;
1103     BDRVRawState *s = aiocb->bs->opaque;
1104 
1105     if (!s->has_write_zeroes) {
1106         return -ENOTSUP;
1107     }
1108 
1109 #ifdef BLKZEROOUT
1110     do {
1111         uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1112         if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1113             return 0;
1114         }
1115     } while (errno == EINTR);
1116 
1117     ret = translate_err(-errno);
1118 #endif
1119 
1120     if (ret == -ENOTSUP) {
1121         s->has_write_zeroes = false;
1122     }
1123     return ret;
1124 }
1125 
1126 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
1127 {
1128 #if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
1129     BDRVRawState *s = aiocb->bs->opaque;
1130 #endif
1131 
1132     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1133         return handle_aiocb_write_zeroes_block(aiocb);
1134     }
1135 
1136 #ifdef CONFIG_XFS
1137     if (s->is_xfs) {
1138         return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
1139     }
1140 #endif
1141 
1142 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
1143     if (s->has_write_zeroes) {
1144         int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1145                                aiocb->aio_offset, aiocb->aio_nbytes);
1146         if (ret == 0 || ret != -ENOTSUP) {
1147             return ret;
1148         }
1149         s->has_write_zeroes = false;
1150     }
1151 #endif
1152 
1153 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1154     if (s->has_discard && s->has_fallocate) {
1155         int ret = do_fallocate(s->fd,
1156                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1157                                aiocb->aio_offset, aiocb->aio_nbytes);
1158         if (ret == 0) {
1159             ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1160             if (ret == 0 || ret != -ENOTSUP) {
1161                 return ret;
1162             }
1163             s->has_fallocate = false;
1164         } else if (ret != -ENOTSUP) {
1165             return ret;
1166         } else {
1167             s->has_discard = false;
1168         }
1169     }
1170 #endif
1171 
1172 #ifdef CONFIG_FALLOCATE
1173     if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) {
1174         int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1175         if (ret == 0 || ret != -ENOTSUP) {
1176             return ret;
1177         }
1178         s->has_fallocate = false;
1179     }
1180 #endif
1181 
1182     return -ENOTSUP;
1183 }
1184 
1185 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
1186 {
1187     int ret = -EOPNOTSUPP;
1188     BDRVRawState *s = aiocb->bs->opaque;
1189 
1190     if (!s->has_discard) {
1191         return -ENOTSUP;
1192     }
1193 
1194     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1195 #ifdef BLKDISCARD
1196         do {
1197             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1198             if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1199                 return 0;
1200             }
1201         } while (errno == EINTR);
1202 
1203         ret = -errno;
1204 #endif
1205     } else {
1206 #ifdef CONFIG_XFS
1207         if (s->is_xfs) {
1208             return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1209         }
1210 #endif
1211 
1212 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1213         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1214                            aiocb->aio_offset, aiocb->aio_nbytes);
1215 #endif
1216     }
1217 
1218     ret = translate_err(ret);
1219     if (ret == -ENOTSUP) {
1220         s->has_discard = false;
1221     }
1222     return ret;
1223 }
1224 
1225 static int aio_worker(void *arg)
1226 {
1227     RawPosixAIOData *aiocb = arg;
1228     ssize_t ret = 0;
1229 
1230     switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
1231     case QEMU_AIO_READ:
1232         ret = handle_aiocb_rw(aiocb);
1233         if (ret >= 0 && ret < aiocb->aio_nbytes) {
1234             iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
1235                       0, aiocb->aio_nbytes - ret);
1236 
1237             ret = aiocb->aio_nbytes;
1238         }
1239         if (ret == aiocb->aio_nbytes) {
1240             ret = 0;
1241         } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1242             ret = -EINVAL;
1243         }
1244         break;
1245     case QEMU_AIO_WRITE:
1246         ret = handle_aiocb_rw(aiocb);
1247         if (ret == aiocb->aio_nbytes) {
1248             ret = 0;
1249         } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1250             ret = -EINVAL;
1251         }
1252         break;
1253     case QEMU_AIO_FLUSH:
1254         ret = handle_aiocb_flush(aiocb);
1255         break;
1256     case QEMU_AIO_IOCTL:
1257         ret = handle_aiocb_ioctl(aiocb);
1258         break;
1259     case QEMU_AIO_DISCARD:
1260         ret = handle_aiocb_discard(aiocb);
1261         break;
1262     case QEMU_AIO_WRITE_ZEROES:
1263         ret = handle_aiocb_write_zeroes(aiocb);
1264         break;
1265     default:
1266         fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
1267         ret = -EINVAL;
1268         break;
1269     }
1270 
1271     g_free(aiocb);
1272     return ret;
1273 }
1274 
1275 static int paio_submit_co(BlockDriverState *bs, int fd,
1276                           int64_t offset, QEMUIOVector *qiov,
1277                           int count, int type)
1278 {
1279     RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1280     ThreadPool *pool;
1281 
1282     acb->bs = bs;
1283     acb->aio_type = type;
1284     acb->aio_fildes = fd;
1285 
1286     acb->aio_nbytes = count;
1287     acb->aio_offset = offset;
1288 
1289     if (qiov) {
1290         acb->aio_iov = qiov->iov;
1291         acb->aio_niov = qiov->niov;
1292         assert(qiov->size == count);
1293     }
1294 
1295     trace_paio_submit_co(offset, count, type);
1296     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1297     return thread_pool_submit_co(pool, aio_worker, acb);
1298 }
1299 
1300 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
1301         int64_t offset, QEMUIOVector *qiov, int count,
1302         BlockCompletionFunc *cb, void *opaque, int type)
1303 {
1304     RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1305     ThreadPool *pool;
1306 
1307     acb->bs = bs;
1308     acb->aio_type = type;
1309     acb->aio_fildes = fd;
1310 
1311     acb->aio_nbytes = count;
1312     acb->aio_offset = offset;
1313 
1314     if (qiov) {
1315         acb->aio_iov = qiov->iov;
1316         acb->aio_niov = qiov->niov;
1317         assert(qiov->size == acb->aio_nbytes);
1318     }
1319 
1320     trace_paio_submit(acb, opaque, offset, count, type);
1321     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1322     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
1323 }
1324 
1325 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1326                                    uint64_t bytes, QEMUIOVector *qiov, int type)
1327 {
1328     BDRVRawState *s = bs->opaque;
1329 
1330     if (fd_open(bs) < 0)
1331         return -EIO;
1332 
1333     /*
1334      * Check if the underlying device requires requests to be aligned,
1335      * and if the request we are trying to submit is aligned or not.
1336      * If this is the case tell the low-level driver that it needs
1337      * to copy the buffer.
1338      */
1339     if (s->needs_alignment) {
1340         if (!bdrv_qiov_is_aligned(bs, qiov)) {
1341             type |= QEMU_AIO_MISALIGNED;
1342 #ifdef CONFIG_LINUX_AIO
1343         } else if (s->use_linux_aio) {
1344             LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1345             assert(qiov->size == bytes);
1346             return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
1347 #endif
1348         }
1349     }
1350 
1351     return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
1352 }
1353 
1354 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
1355                                       uint64_t bytes, QEMUIOVector *qiov,
1356                                       int flags)
1357 {
1358     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
1359 }
1360 
1361 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
1362                                        uint64_t bytes, QEMUIOVector *qiov,
1363                                        int flags)
1364 {
1365     assert(flags == 0);
1366     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
1367 }
1368 
1369 static void raw_aio_plug(BlockDriverState *bs)
1370 {
1371 #ifdef CONFIG_LINUX_AIO
1372     BDRVRawState *s = bs->opaque;
1373     if (s->use_linux_aio) {
1374         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1375         laio_io_plug(bs, aio);
1376     }
1377 #endif
1378 }
1379 
1380 static void raw_aio_unplug(BlockDriverState *bs)
1381 {
1382 #ifdef CONFIG_LINUX_AIO
1383     BDRVRawState *s = bs->opaque;
1384     if (s->use_linux_aio) {
1385         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1386         laio_io_unplug(bs, aio);
1387     }
1388 #endif
1389 }
1390 
1391 static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
1392         BlockCompletionFunc *cb, void *opaque)
1393 {
1394     BDRVRawState *s = bs->opaque;
1395 
1396     if (fd_open(bs) < 0)
1397         return NULL;
1398 
1399     return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
1400 }
1401 
1402 static void raw_close(BlockDriverState *bs)
1403 {
1404     BDRVRawState *s = bs->opaque;
1405 
1406     if (s->fd >= 0) {
1407         qemu_close(s->fd);
1408         s->fd = -1;
1409     }
1410 }
1411 
1412 static int raw_truncate(BlockDriverState *bs, int64_t offset)
1413 {
1414     BDRVRawState *s = bs->opaque;
1415     struct stat st;
1416 
1417     if (fstat(s->fd, &st)) {
1418         return -errno;
1419     }
1420 
1421     if (S_ISREG(st.st_mode)) {
1422         if (ftruncate(s->fd, offset) < 0) {
1423             return -errno;
1424         }
1425     } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1426        if (offset > raw_getlength(bs)) {
1427            return -EINVAL;
1428        }
1429     } else {
1430         return -ENOTSUP;
1431     }
1432 
1433     return 0;
1434 }
1435 
1436 #ifdef __OpenBSD__
1437 static int64_t raw_getlength(BlockDriverState *bs)
1438 {
1439     BDRVRawState *s = bs->opaque;
1440     int fd = s->fd;
1441     struct stat st;
1442 
1443     if (fstat(fd, &st))
1444         return -errno;
1445     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1446         struct disklabel dl;
1447 
1448         if (ioctl(fd, DIOCGDINFO, &dl))
1449             return -errno;
1450         return (uint64_t)dl.d_secsize *
1451             dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1452     } else
1453         return st.st_size;
1454 }
1455 #elif defined(__NetBSD__)
1456 static int64_t raw_getlength(BlockDriverState *bs)
1457 {
1458     BDRVRawState *s = bs->opaque;
1459     int fd = s->fd;
1460     struct stat st;
1461 
1462     if (fstat(fd, &st))
1463         return -errno;
1464     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1465         struct dkwedge_info dkw;
1466 
1467         if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
1468             return dkw.dkw_size * 512;
1469         } else {
1470             struct disklabel dl;
1471 
1472             if (ioctl(fd, DIOCGDINFO, &dl))
1473                 return -errno;
1474             return (uint64_t)dl.d_secsize *
1475                 dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1476         }
1477     } else
1478         return st.st_size;
1479 }
1480 #elif defined(__sun__)
1481 static int64_t raw_getlength(BlockDriverState *bs)
1482 {
1483     BDRVRawState *s = bs->opaque;
1484     struct dk_minfo minfo;
1485     int ret;
1486     int64_t size;
1487 
1488     ret = fd_open(bs);
1489     if (ret < 0) {
1490         return ret;
1491     }
1492 
1493     /*
1494      * Use the DKIOCGMEDIAINFO ioctl to read the size.
1495      */
1496     ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
1497     if (ret != -1) {
1498         return minfo.dki_lbsize * minfo.dki_capacity;
1499     }
1500 
1501     /*
1502      * There are reports that lseek on some devices fails, but
1503      * irc discussion said that contingency on contingency was overkill.
1504      */
1505     size = lseek(s->fd, 0, SEEK_END);
1506     if (size < 0) {
1507         return -errno;
1508     }
1509     return size;
1510 }
1511 #elif defined(CONFIG_BSD)
/*
 * Generic BSD (including FreeBSD and macOS): return the length of the image
 * in bytes, or a negative errno on failure.
 *
 * When fstat() succeeds and the S_IFCHR bit test on st_mode is non-zero,
 * the size is obtained from a device ioctl where one is available, falling
 * back to the end-of-file position found by lseek().  Otherwise lseek() is
 * used directly.
 *
 * The device branch is assembled from preprocessor fragments: depending on
 * which macros are defined, an "if" emitted by one fragment guards the
 * statement emitted by the next one.  Read the fragments together.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    /*
     * NOTE(review): fd is read from s->fd once, before the again: label.
     * If cdrom_reopen() replaces s->fd, the retry below would presumably
     * still use the old value -- confirm against cdrom_reopen().
     */
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    /* Set once the CD-ROM has been reopened, so we retry at most once. */
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    /* Note: this tests the S_IFCHR bit(s) directly, not S_ISCHR(). */
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
#ifdef DIOCGMEDIASIZE
        /*
         * On success the ioctl stores the size; only if it fails does the
         * fallback statement emitted further below run.
         */
	if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
#elif defined(DIOCGPART)
        /* Take the size from the partition info, or 0 if that fails... */
        {
                struct partinfo pi;
                if (ioctl(fd, DIOCGPART, &pi) == 0)
                        size = pi.media_size;
                else
                        size = 0;
        }
        /* ...and run the fallback below only when the size is still 0. */
        if (size == 0)
#endif
#if defined(__APPLE__) && defined(__MACH__)
        /* Fallback: block count times block size, or else lseek(). */
        {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            } else {
                size = lseek(fd, 0LL, SEEK_END);
                if (size < 0) {
                    return -errno;
                }
            }
        }
#else
        /*
         * Fallback: end-of-file position.  Only this assignment is guarded
         * by a preceding "if"; the size check after it always runs.
         */
        size = lseek(fd, 0LL, SEEK_END);
        if (size < 0) {
            return -errno;
        }
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        /* fstat() failed or the mode test did not match: just seek. */
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
1585 #else
1586 static int64_t raw_getlength(BlockDriverState *bs)
1587 {
1588     BDRVRawState *s = bs->opaque;
1589     int ret;
1590     int64_t size;
1591 
1592     ret = fd_open(bs);
1593     if (ret < 0) {
1594         return ret;
1595     }
1596 
1597     size = lseek(s->fd, 0, SEEK_END);
1598     if (size < 0) {
1599         return -errno;
1600     }
1601     return size;
1602 }
1603 #endif
1604 
1605 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
1606 {
1607     struct stat st;
1608     BDRVRawState *s = bs->opaque;
1609 
1610     if (fstat(s->fd, &st) < 0) {
1611         return -errno;
1612     }
1613     return (int64_t)st.st_blocks * 512;
1614 }
1615 
1616 static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
1617 {
1618     int fd;
1619     int result = 0;
1620     int64_t total_size = 0;
1621     bool nocow = false;
1622     PreallocMode prealloc;
1623     char *buf = NULL;
1624     Error *local_err = NULL;
1625 
1626     strstart(filename, "file:", &filename);
1627 
1628     /* Read out options */
1629     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
1630                           BDRV_SECTOR_SIZE);
1631     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
1632     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
1633     prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
1634                                PREALLOC_MODE__MAX, PREALLOC_MODE_OFF,
1635                                &local_err);
1636     g_free(buf);
1637     if (local_err) {
1638         error_propagate(errp, local_err);
1639         result = -EINVAL;
1640         goto out;
1641     }
1642 
1643     fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
1644                    0644);
1645     if (fd < 0) {
1646         result = -errno;
1647         error_setg_errno(errp, -result, "Could not create file");
1648         goto out;
1649     }
1650 
1651     if (nocow) {
1652 #ifdef __linux__
1653         /* Set NOCOW flag to solve performance issue on fs like btrfs.
1654          * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
1655          * will be ignored since any failure of this operation should not
1656          * block the left work.
1657          */
1658         int attr;
1659         if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
1660             attr |= FS_NOCOW_FL;
1661             ioctl(fd, FS_IOC_SETFLAGS, &attr);
1662         }
1663 #endif
1664     }
1665 
1666     switch (prealloc) {
1667 #ifdef CONFIG_POSIX_FALLOCATE
1668     case PREALLOC_MODE_FALLOC:
1669         /*
1670          * Truncating before posix_fallocate() makes it about twice slower on
1671          * file systems that do not support fallocate(), trying to check if a
1672          * block is allocated before allocating it, so don't do that here.
1673          */
1674         result = -posix_fallocate(fd, 0, total_size);
1675         if (result != 0) {
1676             /* posix_fallocate() doesn't set errno. */
1677             error_setg_errno(errp, -result,
1678                              "Could not preallocate data for the new file");
1679         }
1680         break;
1681 #endif
1682     case PREALLOC_MODE_FULL:
1683     {
1684         /*
1685          * Knowing the final size from the beginning could allow the file
1686          * system driver to do less allocations and possibly avoid
1687          * fragmentation of the file.
1688          */
1689         if (ftruncate(fd, total_size) != 0) {
1690             result = -errno;
1691             error_setg_errno(errp, -result, "Could not resize file");
1692             goto out_close;
1693         }
1694 
1695         int64_t num = 0, left = total_size;
1696         buf = g_malloc0(65536);
1697 
1698         while (left > 0) {
1699             num = MIN(left, 65536);
1700             result = write(fd, buf, num);
1701             if (result < 0) {
1702                 result = -errno;
1703                 error_setg_errno(errp, -result,
1704                                  "Could not write to the new file");
1705                 break;
1706             }
1707             left -= result;
1708         }
1709         if (result >= 0) {
1710             result = fsync(fd);
1711             if (result < 0) {
1712                 result = -errno;
1713                 error_setg_errno(errp, -result,
1714                                  "Could not flush new file to disk");
1715             }
1716         }
1717         g_free(buf);
1718         break;
1719     }
1720     case PREALLOC_MODE_OFF:
1721         if (ftruncate(fd, total_size) != 0) {
1722             result = -errno;
1723             error_setg_errno(errp, -result, "Could not resize file");
1724         }
1725         break;
1726     default:
1727         result = -EINVAL;
1728         error_setg(errp, "Unsupported preallocation mode: %s",
1729                    PreallocMode_lookup[prealloc]);
1730         break;
1731     }
1732 
1733 out_close:
1734     if (qemu_close(fd) != 0 && result == 0) {
1735         result = -errno;
1736         error_setg_errno(errp, -result, "Could not close the new file");
1737     }
1738 out:
1739     return result;
1740 }
1741 
1742 /*
1743  * Find allocation range in @bs around offset @start.
1744  * May change underlying file descriptor's file offset.
1745  * If @start is not in a hole, store @start in @data, and the
1746  * beginning of the next hole in @hole, and return 0.
1747  * If @start is in a non-trailing hole, store @start in @hole and the
1748  * beginning of the next non-hole in @data, and return 0.
1749  * If @start is in a trailing hole or beyond EOF, return -ENXIO.
1750  * If we can't find out, return a negative errno other than -ENXIO.
1751  */
1752 static int find_allocation(BlockDriverState *bs, off_t start,
1753                            off_t *data, off_t *hole)
1754 {
1755 #if defined SEEK_HOLE && defined SEEK_DATA
1756     BDRVRawState *s = bs->opaque;
1757     off_t offs;
1758 
1759     /*
1760      * SEEK_DATA cases:
1761      * D1. offs == start: start is in data
1762      * D2. offs > start: start is in a hole, next data at offs
1763      * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
1764      *                              or start is beyond EOF
1765      *     If the latter happens, the file has been truncated behind
1766      *     our back since we opened it.  All bets are off then.
1767      *     Treating like a trailing hole is simplest.
1768      * D4. offs < 0, errno != ENXIO: we learned nothing
1769      */
1770     offs = lseek(s->fd, start, SEEK_DATA);
1771     if (offs < 0) {
1772         return -errno;          /* D3 or D4 */
1773     }
1774     assert(offs >= start);
1775 
1776     if (offs > start) {
1777         /* D2: in hole, next data at offs */
1778         *hole = start;
1779         *data = offs;
1780         return 0;
1781     }
1782 
1783     /* D1: in data, end not yet known */
1784 
1785     /*
1786      * SEEK_HOLE cases:
1787      * H1. offs == start: start is in a hole
1788      *     If this happens here, a hole has been dug behind our back
1789      *     since the previous lseek().
1790      * H2. offs > start: either start is in data, next hole at offs,
1791      *                   or start is in trailing hole, EOF at offs
1792      *     Linux treats trailing holes like any other hole: offs ==
1793      *     start.  Solaris seeks to EOF instead: offs > start (blech).
1794      *     If that happens here, a hole has been dug behind our back
1795      *     since the previous lseek().
1796      * H3. offs < 0, errno = ENXIO: start is beyond EOF
1797      *     If this happens, the file has been truncated behind our
1798      *     back since we opened it.  Treat it like a trailing hole.
1799      * H4. offs < 0, errno != ENXIO: we learned nothing
1800      *     Pretend we know nothing at all, i.e. "forget" about D1.
1801      */
1802     offs = lseek(s->fd, start, SEEK_HOLE);
1803     if (offs < 0) {
1804         return -errno;          /* D1 and (H3 or H4) */
1805     }
1806     assert(offs >= start);
1807 
1808     if (offs > start) {
1809         /*
1810          * D1 and H2: either in data, next hole at offs, or it was in
1811          * data but is now in a trailing hole.  In the latter case,
1812          * all bets are off.  Treating it as if it there was data all
1813          * the way to EOF is safe, so simply do that.
1814          */
1815         *data = start;
1816         *hole = offs;
1817         return 0;
1818     }
1819 
1820     /* D1 and H1 */
1821     return -EBUSY;
1822 #else
1823     return -ENOTSUP;
1824 #endif
1825 }
1826 
1827 /*
1828  * Returns the allocation status of the specified sectors.
1829  *
1830  * If 'sector_num' is beyond the end of the disk image the return value is 0
1831  * and 'pnum' is set to 0.
1832  *
1833  * 'pnum' is set to the number of sectors (including and immediately following
1834  * the specified sector) that are known to be in the same
1835  * allocated/unallocated state.
1836  *
1837  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1838  * beyond the end of the disk image it will be clamped.
1839  */
1840 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
1841                                                     int64_t sector_num,
1842                                                     int nb_sectors, int *pnum,
1843                                                     BlockDriverState **file)
1844 {
1845     off_t start, data = 0, hole = 0;
1846     int64_t total_size;
1847     int ret;
1848 
1849     ret = fd_open(bs);
1850     if (ret < 0) {
1851         return ret;
1852     }
1853 
1854     start = sector_num * BDRV_SECTOR_SIZE;
1855     total_size = bdrv_getlength(bs);
1856     if (total_size < 0) {
1857         return total_size;
1858     } else if (start >= total_size) {
1859         *pnum = 0;
1860         return 0;
1861     } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
1862         nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
1863     }
1864 
1865     ret = find_allocation(bs, start, &data, &hole);
1866     if (ret == -ENXIO) {
1867         /* Trailing hole */
1868         *pnum = nb_sectors;
1869         ret = BDRV_BLOCK_ZERO;
1870     } else if (ret < 0) {
1871         /* No info available, so pretend there are no holes */
1872         *pnum = nb_sectors;
1873         ret = BDRV_BLOCK_DATA;
1874     } else if (data == start) {
1875         /* On a data extent, compute sectors to the end of the extent,
1876          * possibly including a partial sector at EOF. */
1877         *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
1878         ret = BDRV_BLOCK_DATA;
1879     } else {
1880         /* On a hole, compute sectors to the beginning of the next extent.  */
1881         assert(hole == start);
1882         *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
1883         ret = BDRV_BLOCK_ZERO;
1884     }
1885     *file = bs;
1886     return ret | BDRV_BLOCK_OFFSET_VALID | start;
1887 }
1888 
1889 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
1890     int64_t offset, int count,
1891     BlockCompletionFunc *cb, void *opaque)
1892 {
1893     BDRVRawState *s = bs->opaque;
1894 
1895     return paio_submit(bs, s->fd, offset, NULL, count,
1896                        cb, opaque, QEMU_AIO_DISCARD);
1897 }
1898 
1899 static int coroutine_fn raw_co_pwrite_zeroes(
1900     BlockDriverState *bs, int64_t offset,
1901     int count, BdrvRequestFlags flags)
1902 {
1903     BDRVRawState *s = bs->opaque;
1904 
1905     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
1906         return paio_submit_co(bs, s->fd, offset, NULL, count,
1907                               QEMU_AIO_WRITE_ZEROES);
1908     } else if (s->discard_zeroes) {
1909         return paio_submit_co(bs, s->fd, offset, NULL, count,
1910                               QEMU_AIO_DISCARD);
1911     }
1912     return -ENOTSUP;
1913 }
1914 
1915 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1916 {
1917     BDRVRawState *s = bs->opaque;
1918 
1919     bdi->unallocated_blocks_are_zero = s->discard_zeroes;
1920     bdi->can_write_zeroes_with_unmap = s->discard_zeroes;
1921     return 0;
1922 }
1923 
/*
 * Creation options understood by raw_create(): image size, the NOCOW file
 * attribute, and the preallocation mode.
 */
static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
    .desc = {
        {
            /* Size of the image. */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            /* Whether to request the no-copy-on-write file attribute. */
            .name = BLOCK_OPT_NOCOW,
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
        {
            /* Preallocation mode, given as a string. */
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, falloc, full)"
        },
        { /* end of list */ }
    }
};
1946 
1947 BlockDriver bdrv_file = {
1948     .format_name = "file",
1949     .protocol_name = "file",
1950     .instance_size = sizeof(BDRVRawState),
1951     .bdrv_needs_filename = true,
1952     .bdrv_probe = NULL, /* no probe for protocols */
1953     .bdrv_parse_filename = raw_parse_filename,
1954     .bdrv_file_open = raw_open,
1955     .bdrv_reopen_prepare = raw_reopen_prepare,
1956     .bdrv_reopen_commit = raw_reopen_commit,
1957     .bdrv_reopen_abort = raw_reopen_abort,
1958     .bdrv_close = raw_close,
1959     .bdrv_create = raw_create,
1960     .bdrv_has_zero_init = bdrv_has_zero_init_1,
1961     .bdrv_co_get_block_status = raw_co_get_block_status,
1962     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
1963 
1964     .bdrv_co_preadv         = raw_co_preadv,
1965     .bdrv_co_pwritev        = raw_co_pwritev,
1966     .bdrv_aio_flush = raw_aio_flush,
1967     .bdrv_aio_pdiscard = raw_aio_pdiscard,
1968     .bdrv_refresh_limits = raw_refresh_limits,
1969     .bdrv_io_plug = raw_aio_plug,
1970     .bdrv_io_unplug = raw_aio_unplug,
1971 
1972     .bdrv_truncate = raw_truncate,
1973     .bdrv_getlength = raw_getlength,
1974     .bdrv_get_info = raw_get_info,
1975     .bdrv_get_allocated_file_size
1976                         = raw_get_allocated_file_size,
1977 
1978     .create_opts = &raw_create_opts,
1979 };
1980 
1981 /***********************************************/
1982 /* host device */
1983 
1984 #if defined(__APPLE__) && defined(__MACH__)
1985 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
1986                                 CFIndex maxPathSize, int flags);
1987 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
1988 {
1989     kern_return_t kernResult = KERN_FAILURE;
1990     mach_port_t     masterPort;
1991     CFMutableDictionaryRef  classesToMatch;
1992     const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
1993     char *mediaType = NULL;
1994 
1995     kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
1996     if ( KERN_SUCCESS != kernResult ) {
1997         printf( "IOMasterPort returned %d\n", kernResult );
1998     }
1999 
2000     int index;
2001     for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
2002         classesToMatch = IOServiceMatching(matching_array[index]);
2003         if (classesToMatch == NULL) {
2004             error_report("IOServiceMatching returned NULL for %s",
2005                          matching_array[index]);
2006             continue;
2007         }
2008         CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
2009                              kCFBooleanTrue);
2010         kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
2011                                                   mediaIterator);
2012         if (kernResult != KERN_SUCCESS) {
2013             error_report("Note: IOServiceGetMatchingServices returned %d",
2014                          kernResult);
2015             continue;
2016         }
2017 
2018         /* If a match was found, leave the loop */
2019         if (*mediaIterator != 0) {
2020             DPRINTF("Matching using %s\n", matching_array[index]);
2021             mediaType = g_strdup(matching_array[index]);
2022             break;
2023         }
2024     }
2025     return mediaType;
2026 }
2027 
2028 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2029                          CFIndex maxPathSize, int flags)
2030 {
2031     io_object_t     nextMedia;
2032     kern_return_t   kernResult = KERN_FAILURE;
2033     *bsdPath = '\0';
2034     nextMedia = IOIteratorNext( mediaIterator );
2035     if ( nextMedia )
2036     {
2037         CFTypeRef   bsdPathAsCFString;
2038     bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
2039         if ( bsdPathAsCFString ) {
2040             size_t devPathLength;
2041             strcpy( bsdPath, _PATH_DEV );
2042             if (flags & BDRV_O_NOCACHE) {
2043                 strcat(bsdPath, "r");
2044             }
2045             devPathLength = strlen( bsdPath );
2046             if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
2047                 kernResult = KERN_SUCCESS;
2048             }
2049             CFRelease( bsdPathAsCFString );
2050         }
2051         IOObjectRelease( nextMedia );
2052     }
2053 
2054     return kernResult;
2055 }
2056 
2057 /* Sets up a real cdrom for use in QEMU */
2058 static bool setup_cdrom(char *bsd_path, Error **errp)
2059 {
2060     int index, num_of_test_partitions = 2, fd;
2061     char test_partition[MAXPATHLEN];
2062     bool partition_found = false;
2063 
2064     /* look for a working partition */
2065     for (index = 0; index < num_of_test_partitions; index++) {
2066         snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
2067                  index);
2068         fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
2069         if (fd >= 0) {
2070             partition_found = true;
2071             qemu_close(fd);
2072             break;
2073         }
2074     }
2075 
2076     /* if a working partition on the device was not found */
2077     if (partition_found == false) {
2078         error_setg(errp, "Failed to find a working partition on disc");
2079     } else {
2080         DPRINTF("Using %s as optical disc\n", test_partition);
2081         pstrcpy(bsd_path, MAXPATHLEN, test_partition);
2082     }
2083     return partition_found;
2084 }
2085 
/*
 * Tell the user, through error_report(), that @file_name may need to be
 * unmounted before QEMU can use it, and show the diskutil commands for
 * unmounting and mounting it again.
 */
static void print_unmounting_directions(const char *file_name)
{
    const char *device = file_name;

    error_report(
        "If device %s is mounted on the desktop, unmount it first before using it in QEMU",
        device);
    error_report(
        "Command to unmount device: diskutil unmountDisk %s",
        device);
    error_report(
        "Command to mount device: diskutil mountDisk %s",
        device);
}
2095 
2096 #endif /* defined(__APPLE__) && defined(__MACH__) */
2097 
2098 static int hdev_probe_device(const char *filename)
2099 {
2100     struct stat st;
2101 
2102     /* allow a dedicated CD-ROM driver to match with a higher priority */
2103     if (strstart(filename, "/dev/cdrom", NULL))
2104         return 50;
2105 
2106     if (stat(filename, &st) >= 0 &&
2107             (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
2108         return 100;
2109     }
2110 
2111     return 0;
2112 }
2113 
2114 static int check_hdev_writable(BDRVRawState *s)
2115 {
2116 #if defined(BLKROGET)
2117     /* Linux block devices can be configured "read-only" using blockdev(8).
2118      * This is independent of device node permissions and therefore open(2)
2119      * with O_RDWR succeeds.  Actual writes fail with EPERM.
2120      *
2121      * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
2122      * check for read-only block devices so that Linux block devices behave
2123      * properly.
2124      */
2125     struct stat st;
2126     int readonly = 0;
2127 
2128     if (fstat(s->fd, &st)) {
2129         return -errno;
2130     }
2131 
2132     if (!S_ISBLK(st.st_mode)) {
2133         return 0;
2134     }
2135 
2136     if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
2137         return -errno;
2138     }
2139 
2140     if (readonly) {
2141         return -EACCES;
2142     }
2143 #endif /* defined(BLKROGET) */
2144     return 0;
2145 }
2146 
2147 static void hdev_parse_filename(const char *filename, QDict *options,
2148                                 Error **errp)
2149 {
2150     /* The prefix is optional, just as for "file". */
2151     strstart(filename, "host_device:", &filename);
2152 
2153     qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
2154 }
2155 
2156 static bool hdev_is_sg(BlockDriverState *bs)
2157 {
2158 
2159 #if defined(__linux__)
2160 
2161     BDRVRawState *s = bs->opaque;
2162     struct stat st;
2163     struct sg_scsi_id scsiid;
2164     int sg_version;
2165     int ret;
2166 
2167     if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
2168         return false;
2169     }
2170 
2171     ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
2172     if (ret < 0) {
2173         return false;
2174     }
2175 
2176     ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
2177     if (ret >= 0) {
2178         DPRINTF("SG device found: type=%d, version=%d\n",
2179             scsiid.scsi_type, sg_version);
2180         return true;
2181     }
2182 
2183 #endif
2184 
2185     return false;
2186 }
2187 
2188 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
2189                      Error **errp)
2190 {
2191     BDRVRawState *s = bs->opaque;
2192     Error *local_err = NULL;
2193     int ret;
2194 
2195 #if defined(__APPLE__) && defined(__MACH__)
2196     const char *filename = qdict_get_str(options, "filename");
2197     char bsd_path[MAXPATHLEN] = "";
2198     bool error_occurred = false;
2199 
2200     /* If using a real cdrom */
2201     if (strcmp(filename, "/dev/cdrom") == 0) {
2202         char *mediaType = NULL;
2203         kern_return_t ret_val;
2204         io_iterator_t mediaIterator = 0;
2205 
2206         mediaType = FindEjectableOpticalMedia(&mediaIterator);
2207         if (mediaType == NULL) {
2208             error_setg(errp, "Please make sure your CD/DVD is in the optical"
2209                        " drive");
2210             error_occurred = true;
2211             goto hdev_open_Mac_error;
2212         }
2213 
2214         ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
2215         if (ret_val != KERN_SUCCESS) {
2216             error_setg(errp, "Could not get BSD path for optical drive");
2217             error_occurred = true;
2218             goto hdev_open_Mac_error;
2219         }
2220 
2221         /* If a real optical drive was not found */
2222         if (bsd_path[0] == '\0') {
2223             error_setg(errp, "Failed to obtain bsd path for optical drive");
2224             error_occurred = true;
2225             goto hdev_open_Mac_error;
2226         }
2227 
2228         /* If using a cdrom disc and finding a partition on the disc failed */
2229         if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
2230             setup_cdrom(bsd_path, errp) == false) {
2231             print_unmounting_directions(bsd_path);
2232             error_occurred = true;
2233             goto hdev_open_Mac_error;
2234         }
2235 
2236         qdict_put(options, "filename", qstring_from_str(bsd_path));
2237 
2238 hdev_open_Mac_error:
2239         g_free(mediaType);
2240         if (mediaIterator) {
2241             IOObjectRelease(mediaIterator);
2242         }
2243         if (error_occurred) {
2244             return -ENOENT;
2245         }
2246     }
2247 #endif /* defined(__APPLE__) && defined(__MACH__) */
2248 
2249     s->type = FTYPE_FILE;
2250 
2251     ret = raw_open_common(bs, options, flags, 0, &local_err);
2252     if (ret < 0) {
2253         error_propagate(errp, local_err);
2254 #if defined(__APPLE__) && defined(__MACH__)
2255         if (*bsd_path) {
2256             filename = bsd_path;
2257         }
2258         /* if a physical device experienced an error while being opened */
2259         if (strncmp(filename, "/dev/", 5) == 0) {
2260             print_unmounting_directions(filename);
2261         }
2262 #endif /* defined(__APPLE__) && defined(__MACH__) */
2263         return ret;
2264     }
2265 
2266     /* Since this does ioctl the device must be already opened */
2267     bs->sg = hdev_is_sg(bs);
2268 
2269     if (flags & BDRV_O_RDWR) {
2270         ret = check_hdev_writable(s);
2271         if (ret < 0) {
2272             raw_close(bs);
2273             error_setg_errno(errp, -ret, "The device is not writable");
2274             return ret;
2275         }
2276     }
2277 
2278     return ret;
2279 }
2280 
2281 #if defined(__linux__)
2282 
2283 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
2284         unsigned long int req, void *buf,
2285         BlockCompletionFunc *cb, void *opaque)
2286 {
2287     BDRVRawState *s = bs->opaque;
2288     RawPosixAIOData *acb;
2289     ThreadPool *pool;
2290 
2291     if (fd_open(bs) < 0)
2292         return NULL;
2293 
2294     acb = g_new(RawPosixAIOData, 1);
2295     acb->bs = bs;
2296     acb->aio_type = QEMU_AIO_IOCTL;
2297     acb->aio_fildes = s->fd;
2298     acb->aio_offset = 0;
2299     acb->aio_ioctl_buf = buf;
2300     acb->aio_ioctl_cmd = req;
2301     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
2302     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
2303 }
2304 #endif /* linux */
2305 
2306 static int fd_open(BlockDriverState *bs)
2307 {
2308     BDRVRawState *s = bs->opaque;
2309 
2310     /* this is just to ensure s->fd is sane (its called by io ops) */
2311     if (s->fd >= 0)
2312         return 0;
2313     return -EIO;
2314 }
2315 
2316 static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs,
2317     int64_t offset, int count,
2318     BlockCompletionFunc *cb, void *opaque)
2319 {
2320     BDRVRawState *s = bs->opaque;
2321 
2322     if (fd_open(bs) < 0) {
2323         return NULL;
2324     }
2325     return paio_submit(bs, s->fd, offset, NULL, count,
2326                        cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
2327 }
2328 
2329 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
2330     int64_t offset, int count, BdrvRequestFlags flags)
2331 {
2332     BDRVRawState *s = bs->opaque;
2333     int rc;
2334 
2335     rc = fd_open(bs);
2336     if (rc < 0) {
2337         return rc;
2338     }
2339     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
2340         return paio_submit_co(bs, s->fd, offset, NULL, count,
2341                               QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
2342     } else if (s->discard_zeroes) {
2343         return paio_submit_co(bs, s->fd, offset, NULL, count,
2344                               QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
2345     }
2346     return -ENOTSUP;
2347 }
2348 
2349 static int hdev_create(const char *filename, QemuOpts *opts,
2350                        Error **errp)
2351 {
2352     int fd;
2353     int ret = 0;
2354     struct stat stat_buf;
2355     int64_t total_size = 0;
2356     bool has_prefix;
2357 
2358     /* This function is used by both protocol block drivers and therefore either
2359      * of these prefixes may be given.
2360      * The return value has to be stored somewhere, otherwise this is an error
2361      * due to -Werror=unused-value. */
2362     has_prefix =
2363         strstart(filename, "host_device:", &filename) ||
2364         strstart(filename, "host_cdrom:" , &filename);
2365 
2366     (void)has_prefix;
2367 
2368     ret = raw_normalize_devicepath(&filename);
2369     if (ret < 0) {
2370         error_setg_errno(errp, -ret, "Could not normalize device path");
2371         return ret;
2372     }
2373 
2374     /* Read out options */
2375     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2376                           BDRV_SECTOR_SIZE);
2377 
2378     fd = qemu_open(filename, O_WRONLY | O_BINARY);
2379     if (fd < 0) {
2380         ret = -errno;
2381         error_setg_errno(errp, -ret, "Could not open device");
2382         return ret;
2383     }
2384 
2385     if (fstat(fd, &stat_buf) < 0) {
2386         ret = -errno;
2387         error_setg_errno(errp, -ret, "Could not stat device");
2388     } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
2389         error_setg(errp,
2390                    "The given file is neither a block nor a character device");
2391         ret = -ENODEV;
2392     } else if (lseek(fd, 0, SEEK_END) < total_size) {
2393         error_setg(errp, "Device is too small");
2394         ret = -ENOSPC;
2395     }
2396 
2397     qemu_close(fd);
2398     return ret;
2399 }
2400 
2401 static BlockDriver bdrv_host_device = {
2402     .format_name        = "host_device",
2403     .protocol_name        = "host_device",
2404     .instance_size      = sizeof(BDRVRawState),
2405     .bdrv_needs_filename = true,
2406     .bdrv_probe_device  = hdev_probe_device,
2407     .bdrv_parse_filename = hdev_parse_filename,
2408     .bdrv_file_open     = hdev_open,
2409     .bdrv_close         = raw_close,
2410     .bdrv_reopen_prepare = raw_reopen_prepare,
2411     .bdrv_reopen_commit  = raw_reopen_commit,
2412     .bdrv_reopen_abort   = raw_reopen_abort,
2413     .bdrv_create         = hdev_create,
2414     .create_opts         = &raw_create_opts,
2415     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
2416 
2417     .bdrv_co_preadv         = raw_co_preadv,
2418     .bdrv_co_pwritev        = raw_co_pwritev,
2419     .bdrv_aio_flush	= raw_aio_flush,
2420     .bdrv_aio_pdiscard   = hdev_aio_pdiscard,
2421     .bdrv_refresh_limits = raw_refresh_limits,
2422     .bdrv_io_plug = raw_aio_plug,
2423     .bdrv_io_unplug = raw_aio_unplug,
2424 
2425     .bdrv_truncate      = raw_truncate,
2426     .bdrv_getlength	= raw_getlength,
2427     .bdrv_get_info = raw_get_info,
2428     .bdrv_get_allocated_file_size
2429                         = raw_get_allocated_file_size,
2430     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
2431     .bdrv_probe_geometry = hdev_probe_geometry,
2432 
2433     /* generic scsi device */
2434 #ifdef __linux__
2435     .bdrv_aio_ioctl     = hdev_aio_ioctl,
2436 #endif
2437 };
2438 
2439 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2440 static void cdrom_parse_filename(const char *filename, QDict *options,
2441                                  Error **errp)
2442 {
2443     /* The prefix is optional, just as for "file". */
2444     strstart(filename, "host_cdrom:", &filename);
2445 
2446     qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
2447 }
2448 #endif
2449 
2450 #ifdef __linux__
2451 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
2452                       Error **errp)
2453 {
2454     BDRVRawState *s = bs->opaque;
2455 
2456     s->type = FTYPE_CD;
2457 
2458     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
2459     return raw_open_common(bs, options, flags, O_NONBLOCK, errp);
2460 }
2461 
2462 static int cdrom_probe_device(const char *filename)
2463 {
2464     int fd, ret;
2465     int prio = 0;
2466     struct stat st;
2467 
2468     fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
2469     if (fd < 0) {
2470         goto out;
2471     }
2472     ret = fstat(fd, &st);
2473     if (ret == -1 || !S_ISBLK(st.st_mode)) {
2474         goto outc;
2475     }
2476 
2477     /* Attempt to detect via a CDROM specific ioctl */
2478     ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
2479     if (ret >= 0)
2480         prio = 100;
2481 
2482 outc:
2483     qemu_close(fd);
2484 out:
2485     return prio;
2486 }
2487 
2488 static bool cdrom_is_inserted(BlockDriverState *bs)
2489 {
2490     BDRVRawState *s = bs->opaque;
2491     int ret;
2492 
2493     ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
2494     return ret == CDS_DISC_OK;
2495 }
2496 
2497 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
2498 {
2499     BDRVRawState *s = bs->opaque;
2500 
2501     if (eject_flag) {
2502         if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
2503             perror("CDROMEJECT");
2504     } else {
2505         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
2506             perror("CDROMEJECT");
2507     }
2508 }
2509 
2510 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
2511 {
2512     BDRVRawState *s = bs->opaque;
2513 
2514     if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
2515         /*
2516          * Note: an error can happen if the distribution automatically
2517          * mounts the CD-ROM
2518          */
2519         /* perror("CDROM_LOCKDOOR"); */
2520     }
2521 }
2522 
2523 static BlockDriver bdrv_host_cdrom = {
2524     .format_name        = "host_cdrom",
2525     .protocol_name      = "host_cdrom",
2526     .instance_size      = sizeof(BDRVRawState),
2527     .bdrv_needs_filename = true,
2528     .bdrv_probe_device	= cdrom_probe_device,
2529     .bdrv_parse_filename = cdrom_parse_filename,
2530     .bdrv_file_open     = cdrom_open,
2531     .bdrv_close         = raw_close,
2532     .bdrv_reopen_prepare = raw_reopen_prepare,
2533     .bdrv_reopen_commit  = raw_reopen_commit,
2534     .bdrv_reopen_abort   = raw_reopen_abort,
2535     .bdrv_create         = hdev_create,
2536     .create_opts         = &raw_create_opts,
2537 
2538 
2539     .bdrv_co_preadv         = raw_co_preadv,
2540     .bdrv_co_pwritev        = raw_co_pwritev,
2541     .bdrv_aio_flush	= raw_aio_flush,
2542     .bdrv_refresh_limits = raw_refresh_limits,
2543     .bdrv_io_plug = raw_aio_plug,
2544     .bdrv_io_unplug = raw_aio_unplug,
2545 
2546     .bdrv_truncate      = raw_truncate,
2547     .bdrv_getlength      = raw_getlength,
2548     .has_variable_length = true,
2549     .bdrv_get_allocated_file_size
2550                         = raw_get_allocated_file_size,
2551 
2552     /* removable device support */
2553     .bdrv_is_inserted   = cdrom_is_inserted,
2554     .bdrv_eject         = cdrom_eject,
2555     .bdrv_lock_medium   = cdrom_lock_medium,
2556 
2557     /* generic scsi device */
2558     .bdrv_aio_ioctl     = hdev_aio_ioctl,
2559 };
2560 #endif /* __linux__ */
2561 
2562 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2563 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
2564                       Error **errp)
2565 {
2566     BDRVRawState *s = bs->opaque;
2567     Error *local_err = NULL;
2568     int ret;
2569 
2570     s->type = FTYPE_CD;
2571 
2572     ret = raw_open_common(bs, options, flags, 0, &local_err);
2573     if (ret) {
2574         error_propagate(errp, local_err);
2575         return ret;
2576     }
2577 
2578     /* make sure the door isn't locked at this time */
2579     ioctl(s->fd, CDIOCALLOW);
2580     return 0;
2581 }
2582 
/*
 * Score how well the host_cdrom driver fits @filename (FreeBSD): 100 for
 * names starting with "/dev/cd" or "/dev/acd", 0 otherwise.
 */
static int cdrom_probe_device(const char *filename)
{
    if (strstart(filename, "/dev/cd", NULL)) {
        return 100;
    }
    if (strstart(filename, "/dev/acd", NULL)) {
        return 100;
    }
    return 0;
}
2590 
2591 static int cdrom_reopen(BlockDriverState *bs)
2592 {
2593     BDRVRawState *s = bs->opaque;
2594     int fd;
2595 
2596     /*
2597      * Force reread of possibly changed/newly loaded disc,
2598      * FreeBSD seems to not notice sometimes...
2599      */
2600     if (s->fd >= 0)
2601         qemu_close(s->fd);
2602     fd = qemu_open(bs->filename, s->open_flags, 0644);
2603     if (fd < 0) {
2604         s->fd = -1;
2605         return -EIO;
2606     }
2607     s->fd = fd;
2608 
2609     /* make sure the door isn't locked at this time */
2610     ioctl(s->fd, CDIOCALLOW);
2611     return 0;
2612 }
2613 
2614 static bool cdrom_is_inserted(BlockDriverState *bs)
2615 {
2616     return raw_getlength(bs) > 0;
2617 }
2618 
2619 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
2620 {
2621     BDRVRawState *s = bs->opaque;
2622 
2623     if (s->fd < 0)
2624         return;
2625 
2626     (void) ioctl(s->fd, CDIOCALLOW);
2627 
2628     if (eject_flag) {
2629         if (ioctl(s->fd, CDIOCEJECT) < 0)
2630             perror("CDIOCEJECT");
2631     } else {
2632         if (ioctl(s->fd, CDIOCCLOSE) < 0)
2633             perror("CDIOCCLOSE");
2634     }
2635 
2636     cdrom_reopen(bs);
2637 }
2638 
2639 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
2640 {
2641     BDRVRawState *s = bs->opaque;
2642 
2643     if (s->fd < 0)
2644         return;
2645     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
2646         /*
2647          * Note: an error can happen if the distribution automatically
2648          * mounts the CD-ROM
2649          */
2650         /* perror("CDROM_LOCKDOOR"); */
2651     }
2652 }
2653 
2654 static BlockDriver bdrv_host_cdrom = {
2655     .format_name        = "host_cdrom",
2656     .protocol_name      = "host_cdrom",
2657     .instance_size      = sizeof(BDRVRawState),
2658     .bdrv_needs_filename = true,
2659     .bdrv_probe_device	= cdrom_probe_device,
2660     .bdrv_parse_filename = cdrom_parse_filename,
2661     .bdrv_file_open     = cdrom_open,
2662     .bdrv_close         = raw_close,
2663     .bdrv_reopen_prepare = raw_reopen_prepare,
2664     .bdrv_reopen_commit  = raw_reopen_commit,
2665     .bdrv_reopen_abort   = raw_reopen_abort,
2666     .bdrv_create        = hdev_create,
2667     .create_opts        = &raw_create_opts,
2668 
2669     .bdrv_co_preadv         = raw_co_preadv,
2670     .bdrv_co_pwritev        = raw_co_pwritev,
2671     .bdrv_aio_flush	= raw_aio_flush,
2672     .bdrv_refresh_limits = raw_refresh_limits,
2673     .bdrv_io_plug = raw_aio_plug,
2674     .bdrv_io_unplug = raw_aio_unplug,
2675 
2676     .bdrv_truncate      = raw_truncate,
2677     .bdrv_getlength      = raw_getlength,
2678     .has_variable_length = true,
2679     .bdrv_get_allocated_file_size
2680                         = raw_get_allocated_file_size,
2681 
2682     /* removable device support */
2683     .bdrv_is_inserted   = cdrom_is_inserted,
2684     .bdrv_eject         = cdrom_eject,
2685     .bdrv_lock_medium   = cdrom_lock_medium,
2686 };
2687 #endif /* __FreeBSD__ */
2688 
2689 static void bdrv_file_init(void)
2690 {
2691     /*
2692      * Register all the drivers.  Note that order is important, the driver
2693      * registered last will get probed first.
2694      */
2695     bdrv_register(&bdrv_file);
2696     bdrv_register(&bdrv_host_device);
2697 #ifdef __linux__
2698     bdrv_register(&bdrv_host_cdrom);
2699 #endif
2700 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2701     bdrv_register(&bdrv_host_cdrom);
2702 #endif
2703 }
2704 
2705 block_init(bdrv_file_init);
2706