1 /* 2 * Block driver for RAW files (posix) 3 * 4 * Copyright (c) 2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include "qemu/osdep.h" 25 #include "qapi/error.h" 26 #include "qemu/cutils.h" 27 #include "qemu/error-report.h" 28 #include "qemu/timer.h" 29 #include "qemu/log.h" 30 #include "block/block_int.h" 31 #include "qemu/module.h" 32 #include "trace.h" 33 #include "block/thread-pool.h" 34 #include "qemu/iov.h" 35 #include "block/raw-aio.h" 36 #include "qapi/util.h" 37 #include "qapi/qmp/qstring.h" 38 39 #if defined(__APPLE__) && (__MACH__) 40 #include <paths.h> 41 #include <sys/param.h> 42 #include <IOKit/IOKitLib.h> 43 #include <IOKit/IOBSD.h> 44 #include <IOKit/storage/IOMediaBSDClient.h> 45 #include <IOKit/storage/IOMedia.h> 46 #include <IOKit/storage/IOCDMedia.h> 47 //#include <IOKit/storage/IOCDTypes.h> 48 #include <IOKit/storage/IODVDMedia.h> 49 #include <CoreFoundation/CoreFoundation.h> 50 #endif 51 52 #ifdef __sun__ 53 #define _POSIX_PTHREAD_SEMANTICS 1 54 #include <sys/dkio.h> 55 #endif 56 #ifdef __linux__ 57 #include <sys/ioctl.h> 58 #include <sys/param.h> 59 #include <linux/cdrom.h> 60 #include <linux/fd.h> 61 #include <linux/fs.h> 62 #include <linux/hdreg.h> 63 #include <scsi/sg.h> 64 #ifdef __s390__ 65 #include <asm/dasd.h> 66 #endif 67 #ifndef FS_NOCOW_FL 68 #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ 69 #endif 70 #endif 71 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) 72 #include <linux/falloc.h> 73 #endif 74 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 75 #include <sys/disk.h> 76 #include <sys/cdio.h> 77 #endif 78 79 #ifdef __OpenBSD__ 80 #include <sys/ioctl.h> 81 #include <sys/disklabel.h> 82 #include <sys/dkio.h> 83 #endif 84 85 #ifdef __NetBSD__ 86 #include <sys/ioctl.h> 87 #include <sys/disklabel.h> 88 #include <sys/dkio.h> 89 #include <sys/disk.h> 90 #endif 91 92 #ifdef __DragonFly__ 93 #include <sys/ioctl.h> 94 #include <sys/diskslice.h> 95 #endif 96 97 #ifdef CONFIG_XFS 98 #include <xfs/xfs.h> 99 #endif 100 101 //#define DEBUG_BLOCK 102 103 #ifdef DEBUG_BLOCK 104 # define DEBUG_BLOCK_PRINT 1 105 #else 106 # define DEBUG_BLOCK_PRINT 0 107 #endif 108 #define DPRINTF(fmt, ...) \ 109 do { \ 110 if (DEBUG_BLOCK_PRINT) { \ 111 printf(fmt, ## __VA_ARGS__); \ 112 } \ 113 } while (0) 114 115 /* OS X does not have O_DSYNC */ 116 #ifndef O_DSYNC 117 #ifdef O_SYNC 118 #define O_DSYNC O_SYNC 119 #elif defined(O_FSYNC) 120 #define O_DSYNC O_FSYNC 121 #endif 122 #endif 123 124 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ 125 #ifndef O_DIRECT 126 #define O_DIRECT O_DSYNC 127 #endif 128 129 #define FTYPE_FILE 0 130 #define FTYPE_CD 1 131 132 #define MAX_BLOCKSIZE 4096 133 134 typedef struct BDRVRawState { 135 int fd; 136 int type; 137 int open_flags; 138 size_t buf_align; 139 140 #ifdef CONFIG_XFS 141 bool is_xfs:1; 142 #endif 143 bool has_discard:1; 144 bool has_write_zeroes:1; 145 bool discard_zeroes:1; 146 bool use_linux_aio:1; 147 bool has_fallocate; 148 bool needs_alignment; 149 } BDRVRawState; 150 151 typedef struct BDRVRawReopenState { 152 int fd; 153 int open_flags; 154 } BDRVRawReopenState; 155 156 static int fd_open(BlockDriverState *bs); 157 static int64_t raw_getlength(BlockDriverState *bs); 158 159 typedef struct RawPosixAIOData { 160 BlockDriverState *bs; 161 int aio_fildes; 162 union { 163 struct iovec *aio_iov; 164 void *aio_ioctl_buf; 165 }; 166 int aio_niov; 167 uint64_t aio_nbytes; 168 #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ 169 off_t aio_offset; 170 int aio_type; 171 } RawPosixAIOData; 172 173 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 174 static int cdrom_reopen(BlockDriverState *bs); 175 #endif 176 177 #if defined(__NetBSD__) 178 static int raw_normalize_devicepath(const char **filename) 179 { 180 static char namebuf[PATH_MAX]; 181 const char *dp, *fname; 182 struct stat sb; 183 184 fname = *filename; 185 dp = strrchr(fname, '/'); 186 if (lstat(fname, &sb) < 0) { 187 fprintf(stderr, "%s: stat failed: %s\n", 188 fname, strerror(errno)); 189 return -errno; 190 } 191 192 if (!S_ISBLK(sb.st_mode)) { 193 return 0; 194 } 195 196 if (dp == NULL) { 197 snprintf(namebuf, PATH_MAX, "r%s", fname); 198 } else { 199 snprintf(namebuf, PATH_MAX, "%.*s/r%s", 200 (int)(dp - fname), fname, dp + 1); 201 } 202 fprintf(stderr, "%s is a block device", fname); 203 *filename = namebuf; 204 fprintf(stderr, ", using %s\n", *filename); 205 206 return 0; 207 } 208 #else 209 static int raw_normalize_devicepath(const char **filename) 210 { 211 return 0; 212 } 213 #endif 214 215 /* 216 * Get logical block size via ioctl. On success store it in @sector_size_p. 217 */ 218 static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) 219 { 220 unsigned int sector_size; 221 bool success = false; 222 223 errno = ENOTSUP; 224 225 /* Try a few ioctls to get the right size */ 226 #ifdef BLKSSZGET 227 if (ioctl(fd, BLKSSZGET, §or_size) >= 0) { 228 *sector_size_p = sector_size; 229 success = true; 230 } 231 #endif 232 #ifdef DKIOCGETBLOCKSIZE 233 if (ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) >= 0) { 234 *sector_size_p = sector_size; 235 success = true; 236 } 237 #endif 238 #ifdef DIOCGSECTORSIZE 239 if (ioctl(fd, DIOCGSECTORSIZE, §or_size) >= 0) { 240 *sector_size_p = sector_size; 241 success = true; 242 } 243 #endif 244 245 return success ? 0 : -errno; 246 } 247 248 /** 249 * Get physical block size of @fd. 250 * On success, store it in @blk_size and return 0. 251 * On failure, return -errno. 252 */ 253 static int probe_physical_blocksize(int fd, unsigned int *blk_size) 254 { 255 #ifdef BLKPBSZGET 256 if (ioctl(fd, BLKPBSZGET, blk_size) < 0) { 257 return -errno; 258 } 259 return 0; 260 #else 261 return -ENOTSUP; 262 #endif 263 } 264 265 /* Check if read is allowed with given memory buffer and length. 266 * 267 * This function is used to check O_DIRECT memory buffer and request alignment. 268 */ 269 static bool raw_is_io_aligned(int fd, void *buf, size_t len) 270 { 271 ssize_t ret = pread(fd, buf, len, 0); 272 273 if (ret >= 0) { 274 return true; 275 } 276 277 #ifdef __linux__ 278 /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore 279 * other errors (e.g. real I/O error), which could happen on a failed 280 * drive, since we only care about probing alignment. 281 */ 282 if (errno != EINVAL) { 283 return true; 284 } 285 #endif 286 287 return false; 288 } 289 290 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) 291 { 292 BDRVRawState *s = bs->opaque; 293 char *buf; 294 size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); 295 296 /* For SCSI generic devices the alignment is not really used. 297 With buffered I/O, we don't have any restrictions. */ 298 if (bdrv_is_sg(bs) || !s->needs_alignment) { 299 bs->bl.request_alignment = 1; 300 s->buf_align = 1; 301 return; 302 } 303 304 bs->bl.request_alignment = 0; 305 s->buf_align = 0; 306 /* Let's try to use the logical blocksize for the alignment. */ 307 if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) { 308 bs->bl.request_alignment = 0; 309 } 310 #ifdef CONFIG_XFS 311 if (s->is_xfs) { 312 struct dioattr da; 313 if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { 314 bs->bl.request_alignment = da.d_miniosz; 315 /* The kernel returns wrong information for d_mem */ 316 /* s->buf_align = da.d_mem; */ 317 } 318 } 319 #endif 320 321 /* If we could not get the sizes so far, we can only guess them */ 322 if (!s->buf_align) { 323 size_t align; 324 buf = qemu_memalign(max_align, 2 * max_align); 325 for (align = 512; align <= max_align; align <<= 1) { 326 if (raw_is_io_aligned(fd, buf + align, max_align)) { 327 s->buf_align = align; 328 break; 329 } 330 } 331 qemu_vfree(buf); 332 } 333 334 if (!bs->bl.request_alignment) { 335 size_t align; 336 buf = qemu_memalign(s->buf_align, max_align); 337 for (align = 512; align <= max_align; align <<= 1) { 338 if (raw_is_io_aligned(fd, buf, align)) { 339 bs->bl.request_alignment = align; 340 break; 341 } 342 } 343 qemu_vfree(buf); 344 } 345 346 if (!s->buf_align || !bs->bl.request_alignment) { 347 error_setg(errp, "Could not find working O_DIRECT alignment"); 348 error_append_hint(errp, "Try cache.direct=off\n"); 349 } 350 } 351 352 static void raw_parse_flags(int bdrv_flags, int *open_flags) 353 { 354 assert(open_flags != NULL); 355 356 *open_flags |= O_BINARY; 357 *open_flags &= ~O_ACCMODE; 358 if (bdrv_flags & BDRV_O_RDWR) { 359 *open_flags |= O_RDWR; 360 } else { 361 *open_flags |= O_RDONLY; 362 } 363 364 /* Use O_DSYNC for write-through caching, no flags for write-back caching, 365 * and O_DIRECT for no caching. */ 366 if ((bdrv_flags & BDRV_O_NOCACHE)) { 367 *open_flags |= O_DIRECT; 368 } 369 } 370 371 static void raw_parse_filename(const char *filename, QDict *options, 372 Error **errp) 373 { 374 /* The filename does not have to be prefixed by the protocol name, since 375 * "file" is the default protocol; therefore, the return value of this 376 * function call can be ignored. */ 377 strstart(filename, "file:", &filename); 378 379 qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); 380 } 381 382 static QemuOptsList raw_runtime_opts = { 383 .name = "raw", 384 .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), 385 .desc = { 386 { 387 .name = "filename", 388 .type = QEMU_OPT_STRING, 389 .help = "File name of the image", 390 }, 391 { 392 .name = "aio", 393 .type = QEMU_OPT_STRING, 394 .help = "host AIO implementation (threads, native)", 395 }, 396 { /* end of list */ } 397 }, 398 }; 399 400 static int raw_open_common(BlockDriverState *bs, QDict *options, 401 int bdrv_flags, int open_flags, Error **errp) 402 { 403 BDRVRawState *s = bs->opaque; 404 QemuOpts *opts; 405 Error *local_err = NULL; 406 const char *filename = NULL; 407 BlockdevAioOptions aio, aio_default; 408 int fd, ret; 409 struct stat st; 410 411 opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); 412 qemu_opts_absorb_qdict(opts, options, &local_err); 413 if (local_err) { 414 error_propagate(errp, local_err); 415 ret = -EINVAL; 416 goto fail; 417 } 418 419 filename = qemu_opt_get(opts, "filename"); 420 421 ret = raw_normalize_devicepath(&filename); 422 if (ret != 0) { 423 error_setg_errno(errp, -ret, "Could not normalize device path"); 424 goto fail; 425 } 426 427 aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) 428 ? BLOCKDEV_AIO_OPTIONS_NATIVE 429 : BLOCKDEV_AIO_OPTIONS_THREADS; 430 aio = qapi_enum_parse(BlockdevAioOptions_lookup, qemu_opt_get(opts, "aio"), 431 BLOCKDEV_AIO_OPTIONS__MAX, aio_default, &local_err); 432 if (local_err) { 433 error_propagate(errp, local_err); 434 ret = -EINVAL; 435 goto fail; 436 } 437 s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); 438 439 s->open_flags = open_flags; 440 raw_parse_flags(bdrv_flags, &s->open_flags); 441 442 s->fd = -1; 443 fd = qemu_open(filename, s->open_flags, 0644); 444 if (fd < 0) { 445 ret = -errno; 446 error_setg_errno(errp, errno, "Could not open '%s'", filename); 447 if (ret == -EROFS) { 448 ret = -EACCES; 449 } 450 goto fail; 451 } 452 s->fd = fd; 453 454 #ifdef CONFIG_LINUX_AIO 455 /* Currently Linux does AIO only for files opened with O_DIRECT */ 456 if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) { 457 error_setg(errp, "aio=native was specified, but it requires " 458 "cache.direct=on, which was not specified."); 459 ret = -EINVAL; 460 goto fail; 461 } 462 #else 463 if (s->use_linux_aio) { 464 error_setg(errp, "aio=native was specified, but is not supported " 465 "in this build."); 466 ret = -EINVAL; 467 goto fail; 468 } 469 #endif /* !defined(CONFIG_LINUX_AIO) */ 470 471 s->has_discard = true; 472 s->has_write_zeroes = true; 473 bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; 474 if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { 475 s->needs_alignment = true; 476 } 477 478 if (fstat(s->fd, &st) < 0) { 479 ret = -errno; 480 error_setg_errno(errp, errno, "Could not stat file"); 481 goto fail; 482 } 483 if (S_ISREG(st.st_mode)) { 484 s->discard_zeroes = true; 485 s->has_fallocate = true; 486 } 487 if (S_ISBLK(st.st_mode)) { 488 #ifdef BLKDISCARDZEROES 489 unsigned int arg; 490 if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { 491 s->discard_zeroes = true; 492 } 493 #endif 494 #ifdef __linux__ 495 /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do 496 * not rely on the contents of discarded blocks unless using O_DIRECT. 497 * Same for BLKZEROOUT. 498 */ 499 if (!(bs->open_flags & BDRV_O_NOCACHE)) { 500 s->discard_zeroes = false; 501 s->has_write_zeroes = false; 502 } 503 #endif 504 } 505 #ifdef __FreeBSD__ 506 if (S_ISCHR(st.st_mode)) { 507 /* 508 * The file is a char device (disk), which on FreeBSD isn't behind 509 * a pager, so force all requests to be aligned. This is needed 510 * so QEMU makes sure all IO operations on the device are aligned 511 * to sector size, or else FreeBSD will reject them with EINVAL. 512 */ 513 s->needs_alignment = true; 514 } 515 #endif 516 517 #ifdef CONFIG_XFS 518 if (platform_test_xfs_fd(s->fd)) { 519 s->is_xfs = true; 520 } 521 #endif 522 523 ret = 0; 524 fail: 525 if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { 526 unlink(filename); 527 } 528 qemu_opts_del(opts); 529 return ret; 530 } 531 532 static int raw_open(BlockDriverState *bs, QDict *options, int flags, 533 Error **errp) 534 { 535 BDRVRawState *s = bs->opaque; 536 537 s->type = FTYPE_FILE; 538 return raw_open_common(bs, options, flags, 0, errp); 539 } 540 541 static int raw_reopen_prepare(BDRVReopenState *state, 542 BlockReopenQueue *queue, Error **errp) 543 { 544 BDRVRawState *s; 545 BDRVRawReopenState *rs; 546 int ret = 0; 547 Error *local_err = NULL; 548 549 assert(state != NULL); 550 assert(state->bs != NULL); 551 552 s = state->bs->opaque; 553 554 state->opaque = g_new0(BDRVRawReopenState, 1); 555 rs = state->opaque; 556 557 if (s->type == FTYPE_CD) { 558 rs->open_flags |= O_NONBLOCK; 559 } 560 561 raw_parse_flags(state->flags, &rs->open_flags); 562 563 rs->fd = -1; 564 565 int fcntl_flags = O_APPEND | O_NONBLOCK; 566 #ifdef O_NOATIME 567 fcntl_flags |= O_NOATIME; 568 #endif 569 570 #ifdef O_ASYNC 571 /* Not all operating systems have O_ASYNC, and those that don't 572 * will not let us track the state into rs->open_flags (typically 573 * you achieve the same effect with an ioctl, for example I_SETSIG 574 * on Solaris). But we do not use O_ASYNC, so that's fine. 575 */ 576 assert((s->open_flags & O_ASYNC) == 0); 577 #endif 578 579 if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { 580 /* dup the original fd */ 581 rs->fd = qemu_dup(s->fd); 582 if (rs->fd >= 0) { 583 ret = fcntl_setfl(rs->fd, rs->open_flags); 584 if (ret) { 585 qemu_close(rs->fd); 586 rs->fd = -1; 587 } 588 } 589 } 590 591 /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ 592 if (rs->fd == -1) { 593 const char *normalized_filename = state->bs->filename; 594 ret = raw_normalize_devicepath(&normalized_filename); 595 if (ret < 0) { 596 error_setg_errno(errp, -ret, "Could not normalize device path"); 597 } else { 598 assert(!(rs->open_flags & O_CREAT)); 599 rs->fd = qemu_open(normalized_filename, rs->open_flags); 600 if (rs->fd == -1) { 601 error_setg_errno(errp, errno, "Could not reopen file"); 602 ret = -1; 603 } 604 } 605 } 606 607 /* Fail already reopen_prepare() if we can't get a working O_DIRECT 608 * alignment with the new fd. */ 609 if (rs->fd != -1) { 610 raw_probe_alignment(state->bs, rs->fd, &local_err); 611 if (local_err) { 612 qemu_close(rs->fd); 613 rs->fd = -1; 614 error_propagate(errp, local_err); 615 ret = -EINVAL; 616 } 617 } 618 619 return ret; 620 } 621 622 static void raw_reopen_commit(BDRVReopenState *state) 623 { 624 BDRVRawReopenState *rs = state->opaque; 625 BDRVRawState *s = state->bs->opaque; 626 627 s->open_flags = rs->open_flags; 628 629 qemu_close(s->fd); 630 s->fd = rs->fd; 631 632 g_free(state->opaque); 633 state->opaque = NULL; 634 } 635 636 637 static void raw_reopen_abort(BDRVReopenState *state) 638 { 639 BDRVRawReopenState *rs = state->opaque; 640 641 /* nothing to do if NULL, we didn't get far enough */ 642 if (rs == NULL) { 643 return; 644 } 645 646 if (rs->fd >= 0) { 647 qemu_close(rs->fd); 648 rs->fd = -1; 649 } 650 g_free(state->opaque); 651 state->opaque = NULL; 652 } 653 654 static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd) 655 { 656 #ifdef BLKSECTGET 657 int max_bytes = 0; 658 short max_sectors = 0; 659 if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) { 660 return max_bytes; 661 } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) { 662 return max_sectors << BDRV_SECTOR_BITS; 663 } else { 664 return -errno; 665 } 666 #else 667 return -ENOSYS; 668 #endif 669 } 670 671 static int hdev_get_max_segments(const struct stat *st) 672 { 673 #ifdef CONFIG_LINUX 674 char buf[32]; 675 const char *end; 676 char *sysfspath; 677 int ret; 678 int fd = -1; 679 long max_segments; 680 681 sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments", 682 major(st->st_rdev), minor(st->st_rdev)); 683 fd = open(sysfspath, O_RDONLY); 684 if (fd == -1) { 685 ret = -errno; 686 goto out; 687 } 688 do { 689 ret = read(fd, buf, sizeof(buf)); 690 } while (ret == -1 && errno == EINTR); 691 if (ret < 0) { 692 ret = -errno; 693 goto out; 694 } else if (ret == 0) { 695 ret = -EIO; 696 goto out; 697 } 698 buf[ret] = 0; 699 /* The file is ended with '\n', pass 'end' to accept that. */ 700 ret = qemu_strtol(buf, &end, 10, &max_segments); 701 if (ret == 0 && end && *end == '\n') { 702 ret = max_segments; 703 } 704 705 out: 706 g_free(sysfspath); 707 return ret; 708 #else 709 return -ENOTSUP; 710 #endif 711 } 712 713 static void raw_refresh_limits(BlockDriverState *bs, Error **errp) 714 { 715 BDRVRawState *s = bs->opaque; 716 struct stat st; 717 718 if (!fstat(s->fd, &st)) { 719 if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { 720 int ret = hdev_get_max_transfer_length(bs, s->fd); 721 if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { 722 bs->bl.max_transfer = pow2floor(ret); 723 } 724 ret = hdev_get_max_segments(&st); 725 if (ret > 0) { 726 bs->bl.max_transfer = MIN(bs->bl.max_transfer, 727 ret * getpagesize()); 728 } 729 } 730 } 731 732 raw_probe_alignment(bs, s->fd, errp); 733 bs->bl.min_mem_alignment = s->buf_align; 734 bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); 735 } 736 737 static int check_for_dasd(int fd) 738 { 739 #ifdef BIODASDINFO2 740 struct dasd_information2_t info = {0}; 741 742 return ioctl(fd, BIODASDINFO2, &info); 743 #else 744 return -1; 745 #endif 746 } 747 748 /** 749 * Try to get @bs's logical and physical block size. 750 * On success, store them in @bsz and return zero. 751 * On failure, return negative errno. 752 */ 753 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) 754 { 755 BDRVRawState *s = bs->opaque; 756 int ret; 757 758 /* If DASD, get blocksizes */ 759 if (check_for_dasd(s->fd) < 0) { 760 return -ENOTSUP; 761 } 762 ret = probe_logical_blocksize(s->fd, &bsz->log); 763 if (ret < 0) { 764 return ret; 765 } 766 return probe_physical_blocksize(s->fd, &bsz->phys); 767 } 768 769 /** 770 * Try to get @bs's geometry: cyls, heads, sectors. 771 * On success, store them in @geo and return 0. 772 * On failure return -errno. 773 * (Allows block driver to assign default geometry values that guest sees) 774 */ 775 #ifdef __linux__ 776 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 777 { 778 BDRVRawState *s = bs->opaque; 779 struct hd_geometry ioctl_geo = {0}; 780 781 /* If DASD, get its geometry */ 782 if (check_for_dasd(s->fd) < 0) { 783 return -ENOTSUP; 784 } 785 if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { 786 return -errno; 787 } 788 /* HDIO_GETGEO may return success even though geo contains zeros 789 (e.g. certain multipath setups) */ 790 if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { 791 return -ENOTSUP; 792 } 793 /* Do not return a geometry for partition */ 794 if (ioctl_geo.start != 0) { 795 return -ENOTSUP; 796 } 797 geo->heads = ioctl_geo.heads; 798 geo->sectors = ioctl_geo.sectors; 799 geo->cylinders = ioctl_geo.cylinders; 800 801 return 0; 802 } 803 #else /* __linux__ */ 804 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 805 { 806 return -ENOTSUP; 807 } 808 #endif 809 810 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) 811 { 812 int ret; 813 814 ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); 815 if (ret == -1) { 816 return -errno; 817 } 818 819 return 0; 820 } 821 822 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) 823 { 824 int ret; 825 826 ret = qemu_fdatasync(aiocb->aio_fildes); 827 if (ret == -1) { 828 return -errno; 829 } 830 return 0; 831 } 832 833 #ifdef CONFIG_PREADV 834 835 static bool preadv_present = true; 836 837 static ssize_t 838 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 839 { 840 return preadv(fd, iov, nr_iov, offset); 841 } 842 843 static ssize_t 844 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 845 { 846 return pwritev(fd, iov, nr_iov, offset); 847 } 848 849 #else 850 851 static bool preadv_present = false; 852 853 static ssize_t 854 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 855 { 856 return -ENOSYS; 857 } 858 859 static ssize_t 860 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 861 { 862 return -ENOSYS; 863 } 864 865 #endif 866 867 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) 868 { 869 ssize_t len; 870 871 do { 872 if (aiocb->aio_type & QEMU_AIO_WRITE) 873 len = qemu_pwritev(aiocb->aio_fildes, 874 aiocb->aio_iov, 875 aiocb->aio_niov, 876 aiocb->aio_offset); 877 else 878 len = qemu_preadv(aiocb->aio_fildes, 879 aiocb->aio_iov, 880 aiocb->aio_niov, 881 aiocb->aio_offset); 882 } while (len == -1 && errno == EINTR); 883 884 if (len == -1) { 885 return -errno; 886 } 887 return len; 888 } 889 890 /* 891 * Read/writes the data to/from a given linear buffer. 892 * 893 * Returns the number of bytes handles or -errno in case of an error. Short 894 * reads are only returned if the end of the file is reached. 895 */ 896 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) 897 { 898 ssize_t offset = 0; 899 ssize_t len; 900 901 while (offset < aiocb->aio_nbytes) { 902 if (aiocb->aio_type & QEMU_AIO_WRITE) { 903 len = pwrite(aiocb->aio_fildes, 904 (const char *)buf + offset, 905 aiocb->aio_nbytes - offset, 906 aiocb->aio_offset + offset); 907 } else { 908 len = pread(aiocb->aio_fildes, 909 buf + offset, 910 aiocb->aio_nbytes - offset, 911 aiocb->aio_offset + offset); 912 } 913 if (len == -1 && errno == EINTR) { 914 continue; 915 } else if (len == -1 && errno == EINVAL && 916 (aiocb->bs->open_flags & BDRV_O_NOCACHE) && 917 !(aiocb->aio_type & QEMU_AIO_WRITE) && 918 offset > 0) { 919 /* O_DIRECT pread() may fail with EINVAL when offset is unaligned 920 * after a short read. Assume that O_DIRECT short reads only occur 921 * at EOF. Therefore this is a short read, not an I/O error. 922 */ 923 break; 924 } else if (len == -1) { 925 offset = -errno; 926 break; 927 } else if (len == 0) { 928 break; 929 } 930 offset += len; 931 } 932 933 return offset; 934 } 935 936 static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) 937 { 938 ssize_t nbytes; 939 char *buf; 940 941 if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { 942 /* 943 * If there is just a single buffer, and it is properly aligned 944 * we can just use plain pread/pwrite without any problems. 945 */ 946 if (aiocb->aio_niov == 1) { 947 return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); 948 } 949 /* 950 * We have more than one iovec, and all are properly aligned. 951 * 952 * Try preadv/pwritev first and fall back to linearizing the 953 * buffer if it's not supported. 954 */ 955 if (preadv_present) { 956 nbytes = handle_aiocb_rw_vector(aiocb); 957 if (nbytes == aiocb->aio_nbytes || 958 (nbytes < 0 && nbytes != -ENOSYS)) { 959 return nbytes; 960 } 961 preadv_present = false; 962 } 963 964 /* 965 * XXX(hch): short read/write. no easy way to handle the reminder 966 * using these interfaces. For now retry using plain 967 * pread/pwrite? 968 */ 969 } 970 971 /* 972 * Ok, we have to do it the hard way, copy all segments into 973 * a single aligned buffer. 974 */ 975 buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); 976 if (buf == NULL) { 977 return -ENOMEM; 978 } 979 980 if (aiocb->aio_type & QEMU_AIO_WRITE) { 981 char *p = buf; 982 int i; 983 984 for (i = 0; i < aiocb->aio_niov; ++i) { 985 memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); 986 p += aiocb->aio_iov[i].iov_len; 987 } 988 assert(p - buf == aiocb->aio_nbytes); 989 } 990 991 nbytes = handle_aiocb_rw_linear(aiocb, buf); 992 if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { 993 char *p = buf; 994 size_t count = aiocb->aio_nbytes, copy; 995 int i; 996 997 for (i = 0; i < aiocb->aio_niov && count; ++i) { 998 copy = count; 999 if (copy > aiocb->aio_iov[i].iov_len) { 1000 copy = aiocb->aio_iov[i].iov_len; 1001 } 1002 memcpy(aiocb->aio_iov[i].iov_base, p, copy); 1003 assert(count >= copy); 1004 p += copy; 1005 count -= copy; 1006 } 1007 assert(count == 0); 1008 } 1009 qemu_vfree(buf); 1010 1011 return nbytes; 1012 } 1013 1014 #ifdef CONFIG_XFS 1015 static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) 1016 { 1017 struct xfs_flock64 fl; 1018 int err; 1019 1020 memset(&fl, 0, sizeof(fl)); 1021 fl.l_whence = SEEK_SET; 1022 fl.l_start = offset; 1023 fl.l_len = bytes; 1024 1025 if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { 1026 err = errno; 1027 DPRINTF("cannot write zero range (%s)\n", strerror(errno)); 1028 return -err; 1029 } 1030 1031 return 0; 1032 } 1033 1034 static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) 1035 { 1036 struct xfs_flock64 fl; 1037 int err; 1038 1039 memset(&fl, 0, sizeof(fl)); 1040 fl.l_whence = SEEK_SET; 1041 fl.l_start = offset; 1042 fl.l_len = bytes; 1043 1044 if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { 1045 err = errno; 1046 DPRINTF("cannot punch hole (%s)\n", strerror(errno)); 1047 return -err; 1048 } 1049 1050 return 0; 1051 } 1052 #endif 1053 1054 static int translate_err(int err) 1055 { 1056 if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || 1057 err == -ENOTTY) { 1058 err = -ENOTSUP; 1059 } 1060 return err; 1061 } 1062 1063 #ifdef CONFIG_FALLOCATE 1064 static int do_fallocate(int fd, int mode, off_t offset, off_t len) 1065 { 1066 do { 1067 if (fallocate(fd, mode, offset, len) == 0) { 1068 return 0; 1069 } 1070 } while (errno == EINTR); 1071 return translate_err(-errno); 1072 } 1073 #endif 1074 1075 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) 1076 { 1077 int ret = -ENOTSUP; 1078 BDRVRawState *s = aiocb->bs->opaque; 1079 1080 if (!s->has_write_zeroes) { 1081 return -ENOTSUP; 1082 } 1083 1084 #ifdef BLKZEROOUT 1085 do { 1086 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1087 if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { 1088 return 0; 1089 } 1090 } while (errno == EINTR); 1091 1092 ret = translate_err(-errno); 1093 #endif 1094 1095 if (ret == -ENOTSUP) { 1096 s->has_write_zeroes = false; 1097 } 1098 return ret; 1099 } 1100 1101 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) 1102 { 1103 #if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) 1104 BDRVRawState *s = aiocb->bs->opaque; 1105 #endif 1106 1107 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1108 return handle_aiocb_write_zeroes_block(aiocb); 1109 } 1110 1111 #ifdef CONFIG_XFS 1112 if (s->is_xfs) { 1113 return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); 1114 } 1115 #endif 1116 1117 #ifdef CONFIG_FALLOCATE_ZERO_RANGE 1118 if (s->has_write_zeroes) { 1119 int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, 1120 aiocb->aio_offset, aiocb->aio_nbytes); 1121 if (ret == 0 || ret != -ENOTSUP) { 1122 return ret; 1123 } 1124 s->has_write_zeroes = false; 1125 } 1126 #endif 1127 1128 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1129 if (s->has_discard && s->has_fallocate) { 1130 int ret = do_fallocate(s->fd, 1131 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1132 aiocb->aio_offset, aiocb->aio_nbytes); 1133 if (ret == 0) { 1134 ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1135 if (ret == 0 || ret != -ENOTSUP) { 1136 return ret; 1137 } 1138 s->has_fallocate = false; 1139 } else if (ret != -ENOTSUP) { 1140 return ret; 1141 } else { 1142 s->has_discard = false; 1143 } 1144 } 1145 #endif 1146 1147 #ifdef CONFIG_FALLOCATE 1148 if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) { 1149 int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1150 if (ret == 0 || ret != -ENOTSUP) { 1151 return ret; 1152 } 1153 s->has_fallocate = false; 1154 } 1155 #endif 1156 1157 return -ENOTSUP; 1158 } 1159 1160 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) 1161 { 1162 int ret = -EOPNOTSUPP; 1163 BDRVRawState *s = aiocb->bs->opaque; 1164 1165 if (!s->has_discard) { 1166 return -ENOTSUP; 1167 } 1168 1169 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1170 #ifdef BLKDISCARD 1171 do { 1172 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1173 if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { 1174 return 0; 1175 } 1176 } while (errno == EINTR); 1177 1178 ret = -errno; 1179 #endif 1180 } else { 1181 #ifdef CONFIG_XFS 1182 if (s->is_xfs) { 1183 return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); 1184 } 1185 #endif 1186 1187 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1188 ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1189 aiocb->aio_offset, aiocb->aio_nbytes); 1190 #endif 1191 } 1192 1193 ret = translate_err(ret); 1194 if (ret == -ENOTSUP) { 1195 s->has_discard = false; 1196 } 1197 return ret; 1198 } 1199 1200 static int aio_worker(void *arg) 1201 { 1202 RawPosixAIOData *aiocb = arg; 1203 ssize_t ret = 0; 1204 1205 switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { 1206 case QEMU_AIO_READ: 1207 ret = handle_aiocb_rw(aiocb); 1208 if (ret >= 0 && ret < aiocb->aio_nbytes) { 1209 iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, 1210 0, aiocb->aio_nbytes - ret); 1211 1212 ret = aiocb->aio_nbytes; 1213 } 1214 if (ret == aiocb->aio_nbytes) { 1215 ret = 0; 1216 } else if (ret >= 0 && ret < aiocb->aio_nbytes) { 1217 ret = -EINVAL; 1218 } 1219 break; 1220 case QEMU_AIO_WRITE: 1221 ret = handle_aiocb_rw(aiocb); 1222 if (ret == aiocb->aio_nbytes) { 1223 ret = 0; 1224 } else if (ret >= 0 && ret < aiocb->aio_nbytes) { 1225 ret = -EINVAL; 1226 } 1227 break; 1228 case QEMU_AIO_FLUSH: 1229 ret = handle_aiocb_flush(aiocb); 1230 break; 1231 case QEMU_AIO_IOCTL: 1232 ret = handle_aiocb_ioctl(aiocb); 1233 break; 1234 case QEMU_AIO_DISCARD: 1235 ret = handle_aiocb_discard(aiocb); 1236 break; 1237 case QEMU_AIO_WRITE_ZEROES: 1238 ret = handle_aiocb_write_zeroes(aiocb); 1239 break; 1240 default: 1241 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); 1242 ret = -EINVAL; 1243 break; 1244 } 1245 1246 g_free(aiocb); 1247 return ret; 1248 } 1249 1250 static int paio_submit_co(BlockDriverState *bs, int fd, 1251 int64_t offset, QEMUIOVector *qiov, 1252 int count, int type) 1253 { 1254 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); 1255 ThreadPool *pool; 1256 1257 acb->bs = bs; 1258 acb->aio_type = type; 1259 acb->aio_fildes = fd; 1260 1261 acb->aio_nbytes = count; 1262 acb->aio_offset = offset; 1263 1264 if (qiov) { 1265 acb->aio_iov = qiov->iov; 1266 acb->aio_niov = qiov->niov; 1267 assert(qiov->size == count); 1268 } 1269 1270 trace_paio_submit_co(offset, count, type); 1271 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 1272 return thread_pool_submit_co(pool, aio_worker, acb); 1273 } 1274 1275 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, 1276 int64_t offset, QEMUIOVector *qiov, int count, 1277 BlockCompletionFunc *cb, void *opaque, int type) 1278 { 1279 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); 1280 ThreadPool *pool; 1281 1282 acb->bs = bs; 1283 acb->aio_type = type; 1284 acb->aio_fildes = fd; 1285 1286 acb->aio_nbytes = count; 1287 acb->aio_offset = offset; 1288 1289 if (qiov) { 1290 acb->aio_iov = qiov->iov; 1291 acb->aio_niov = qiov->niov; 1292 assert(qiov->size == acb->aio_nbytes); 1293 } 1294 1295 trace_paio_submit(acb, opaque, offset, count, type); 1296 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 1297 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); 1298 } 1299 1300 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, 1301 uint64_t bytes, QEMUIOVector *qiov, int type) 1302 { 1303 BDRVRawState *s = bs->opaque; 1304 1305 if (fd_open(bs) < 0) 1306 return -EIO; 1307 1308 /* 1309 * Check if the underlying device requires requests to be aligned, 1310 * and if the request we are trying to submit is aligned or not. 1311 * If this is the case tell the low-level driver that it needs 1312 * to copy the buffer. 1313 */ 1314 if (s->needs_alignment) { 1315 if (!bdrv_qiov_is_aligned(bs, qiov)) { 1316 type |= QEMU_AIO_MISALIGNED; 1317 #ifdef CONFIG_LINUX_AIO 1318 } else if (s->use_linux_aio) { 1319 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1320 assert(qiov->size == bytes); 1321 return laio_co_submit(bs, aio, s->fd, offset, qiov, type); 1322 #endif 1323 } 1324 } 1325 1326 return paio_submit_co(bs, s->fd, offset, qiov, bytes, type); 1327 } 1328 1329 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, 1330 uint64_t bytes, QEMUIOVector *qiov, 1331 int flags) 1332 { 1333 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ); 1334 } 1335 1336 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, 1337 uint64_t bytes, QEMUIOVector *qiov, 1338 int flags) 1339 { 1340 assert(flags == 0); 1341 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); 1342 } 1343 1344 static void raw_aio_plug(BlockDriverState *bs) 1345 { 1346 #ifdef CONFIG_LINUX_AIO 1347 BDRVRawState *s = bs->opaque; 1348 if (s->use_linux_aio) { 1349 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1350 laio_io_plug(bs, aio); 1351 } 1352 #endif 1353 } 1354 1355 static void raw_aio_unplug(BlockDriverState *bs) 1356 { 1357 #ifdef CONFIG_LINUX_AIO 1358 BDRVRawState *s = bs->opaque; 1359 if (s->use_linux_aio) { 1360 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1361 laio_io_unplug(bs, aio); 1362 } 1363 #endif 1364 } 1365 1366 static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, 1367 BlockCompletionFunc *cb, void *opaque) 1368 { 1369 BDRVRawState *s = bs->opaque; 1370 1371 if (fd_open(bs) < 0) 1372 return NULL; 1373 1374 return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); 1375 } 1376 1377 static void raw_close(BlockDriverState *bs) 1378 { 1379 BDRVRawState *s = bs->opaque; 1380 1381 if (s->fd >= 0) { 1382 qemu_close(s->fd); 1383 s->fd = -1; 1384 } 1385 } 1386 1387 static int raw_truncate(BlockDriverState *bs, int64_t offset) 1388 { 1389 BDRVRawState *s = bs->opaque; 1390 struct stat st; 1391 1392 if (fstat(s->fd, &st)) { 1393 return -errno; 1394 } 1395 1396 if (S_ISREG(st.st_mode)) { 1397 if (ftruncate(s->fd, offset) < 0) { 1398 return -errno; 1399 } 1400 } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1401 if (offset > raw_getlength(bs)) { 1402 return -EINVAL; 1403 } 1404 } else { 1405 return -ENOTSUP; 1406 } 1407 1408 return 0; 1409 } 1410 1411 #ifdef __OpenBSD__ 1412 static int64_t raw_getlength(BlockDriverState *bs) 1413 { 1414 BDRVRawState *s = bs->opaque; 1415 int fd = s->fd; 1416 struct stat st; 1417 1418 if (fstat(fd, &st)) 1419 return -errno; 1420 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1421 struct disklabel dl; 1422 1423 if (ioctl(fd, DIOCGDINFO, &dl)) 1424 return -errno; 1425 return (uint64_t)dl.d_secsize * 1426 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 1427 } else 1428 return st.st_size; 1429 } 1430 #elif defined(__NetBSD__) 1431 static int64_t raw_getlength(BlockDriverState *bs) 1432 { 1433 BDRVRawState *s = bs->opaque; 1434 int fd = s->fd; 1435 struct stat st; 1436 1437 if (fstat(fd, &st)) 1438 return -errno; 1439 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1440 struct dkwedge_info dkw; 1441 1442 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) { 1443 return dkw.dkw_size * 512; 1444 } else { 1445 struct disklabel dl; 1446 1447 if (ioctl(fd, DIOCGDINFO, &dl)) 1448 return -errno; 1449 return (uint64_t)dl.d_secsize * 1450 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 1451 } 1452 } else 1453 return st.st_size; 1454 } 1455 #elif defined(__sun__) 1456 static int64_t raw_getlength(BlockDriverState *bs) 1457 { 1458 BDRVRawState *s = bs->opaque; 1459 struct dk_minfo minfo; 1460 int ret; 1461 int64_t size; 1462 1463 ret = fd_open(bs); 1464 if (ret < 0) { 1465 return ret; 1466 } 1467 1468 /* 1469 * Use the DKIOCGMEDIAINFO ioctl to read the size. 1470 */ 1471 ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo); 1472 if (ret != -1) { 1473 return minfo.dki_lbsize * minfo.dki_capacity; 1474 } 1475 1476 /* 1477 * There are reports that lseek on some devices fails, but 1478 * irc discussion said that contingency on contingency was overkill. 1479 */ 1480 size = lseek(s->fd, 0, SEEK_END); 1481 if (size < 0) { 1482 return -errno; 1483 } 1484 return size; 1485 } 1486 #elif defined(CONFIG_BSD) 1487 static int64_t raw_getlength(BlockDriverState *bs) 1488 { 1489 BDRVRawState *s = bs->opaque; 1490 int fd = s->fd; 1491 int64_t size; 1492 struct stat sb; 1493 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 1494 int reopened = 0; 1495 #endif 1496 int ret; 1497 1498 ret = fd_open(bs); 1499 if (ret < 0) 1500 return ret; 1501 1502 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 1503 again: 1504 #endif 1505 if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { 1506 #ifdef DIOCGMEDIASIZE 1507 if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) 1508 #elif defined(DIOCGPART) 1509 { 1510 struct partinfo pi; 1511 if (ioctl(fd, DIOCGPART, &pi) == 0) 1512 size = pi.media_size; 1513 else 1514 size = 0; 1515 } 1516 if (size == 0) 1517 #endif 1518 #if defined(__APPLE__) && defined(__MACH__) 1519 { 1520 uint64_t sectors = 0; 1521 uint32_t sector_size = 0; 1522 1523 if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 1524 && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { 1525 size = sectors * sector_size; 1526 } else { 1527 size = lseek(fd, 0LL, SEEK_END); 1528 if (size < 0) { 1529 return -errno; 1530 } 1531 } 1532 } 1533 #else 1534 size = lseek(fd, 0LL, SEEK_END); 1535 if (size < 0) { 1536 return -errno; 1537 } 1538 #endif 1539 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 1540 switch(s->type) { 1541 case FTYPE_CD: 1542 /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ 1543 if (size == 2048LL * (unsigned)-1) 1544 size = 0; 1545 /* XXX no disc? maybe we need to reopen... */ 1546 if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) { 1547 reopened = 1; 1548 goto again; 1549 } 1550 } 1551 #endif 1552 } else { 1553 size = lseek(fd, 0, SEEK_END); 1554 if (size < 0) { 1555 return -errno; 1556 } 1557 } 1558 return size; 1559 } 1560 #else 1561 static int64_t raw_getlength(BlockDriverState *bs) 1562 { 1563 BDRVRawState *s = bs->opaque; 1564 int ret; 1565 int64_t size; 1566 1567 ret = fd_open(bs); 1568 if (ret < 0) { 1569 return ret; 1570 } 1571 1572 size = lseek(s->fd, 0, SEEK_END); 1573 if (size < 0) { 1574 return -errno; 1575 } 1576 return size; 1577 } 1578 #endif 1579 1580 static int64_t raw_get_allocated_file_size(BlockDriverState *bs) 1581 { 1582 struct stat st; 1583 BDRVRawState *s = bs->opaque; 1584 1585 if (fstat(s->fd, &st) < 0) { 1586 return -errno; 1587 } 1588 return (int64_t)st.st_blocks * 512; 1589 } 1590 1591 static int raw_create(const char *filename, QemuOpts *opts, Error **errp) 1592 { 1593 int fd; 1594 int result = 0; 1595 int64_t total_size = 0; 1596 bool nocow = false; 1597 PreallocMode prealloc; 1598 char *buf = NULL; 1599 Error *local_err = NULL; 1600 1601 strstart(filename, "file:", &filename); 1602 1603 /* Read out options */ 1604 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 1605 BDRV_SECTOR_SIZE); 1606 nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); 1607 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); 1608 prealloc = qapi_enum_parse(PreallocMode_lookup, buf, 1609 PREALLOC_MODE__MAX, PREALLOC_MODE_OFF, 1610 &local_err); 1611 g_free(buf); 1612 if (local_err) { 1613 error_propagate(errp, local_err); 1614 result = -EINVAL; 1615 goto out; 1616 } 1617 1618 fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, 1619 0644); 1620 if (fd < 0) { 1621 result = -errno; 1622 error_setg_errno(errp, -result, "Could not create file"); 1623 goto out; 1624 } 1625 1626 if (nocow) { 1627 #ifdef __linux__ 1628 /* Set NOCOW flag to solve performance issue on fs like btrfs. 1629 * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value 1630 * will be ignored since any failure of this operation should not 1631 * block the left work. 1632 */ 1633 int attr; 1634 if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { 1635 attr |= FS_NOCOW_FL; 1636 ioctl(fd, FS_IOC_SETFLAGS, &attr); 1637 } 1638 #endif 1639 } 1640 1641 switch (prealloc) { 1642 #ifdef CONFIG_POSIX_FALLOCATE 1643 case PREALLOC_MODE_FALLOC: 1644 /* 1645 * Truncating before posix_fallocate() makes it about twice slower on 1646 * file systems that do not support fallocate(), trying to check if a 1647 * block is allocated before allocating it, so don't do that here. 1648 */ 1649 result = -posix_fallocate(fd, 0, total_size); 1650 if (result != 0) { 1651 /* posix_fallocate() doesn't set errno. */ 1652 error_setg_errno(errp, -result, 1653 "Could not preallocate data for the new file"); 1654 } 1655 break; 1656 #endif 1657 case PREALLOC_MODE_FULL: 1658 { 1659 /* 1660 * Knowing the final size from the beginning could allow the file 1661 * system driver to do less allocations and possibly avoid 1662 * fragmentation of the file. 1663 */ 1664 if (ftruncate(fd, total_size) != 0) { 1665 result = -errno; 1666 error_setg_errno(errp, -result, "Could not resize file"); 1667 goto out_close; 1668 } 1669 1670 int64_t num = 0, left = total_size; 1671 buf = g_malloc0(65536); 1672 1673 while (left > 0) { 1674 num = MIN(left, 65536); 1675 result = write(fd, buf, num); 1676 if (result < 0) { 1677 result = -errno; 1678 error_setg_errno(errp, -result, 1679 "Could not write to the new file"); 1680 break; 1681 } 1682 left -= result; 1683 } 1684 if (result >= 0) { 1685 result = fsync(fd); 1686 if (result < 0) { 1687 result = -errno; 1688 error_setg_errno(errp, -result, 1689 "Could not flush new file to disk"); 1690 } 1691 } 1692 g_free(buf); 1693 break; 1694 } 1695 case PREALLOC_MODE_OFF: 1696 if (ftruncate(fd, total_size) != 0) { 1697 result = -errno; 1698 error_setg_errno(errp, -result, "Could not resize file"); 1699 } 1700 break; 1701 default: 1702 result = -EINVAL; 1703 error_setg(errp, "Unsupported preallocation mode: %s", 1704 PreallocMode_lookup[prealloc]); 1705 break; 1706 } 1707 1708 out_close: 1709 if (qemu_close(fd) != 0 && result == 0) { 1710 result = -errno; 1711 error_setg_errno(errp, -result, "Could not close the new file"); 1712 } 1713 out: 1714 return result; 1715 } 1716 1717 /* 1718 * Find allocation range in @bs around offset @start. 1719 * May change underlying file descriptor's file offset. 1720 * If @start is not in a hole, store @start in @data, and the 1721 * beginning of the next hole in @hole, and return 0. 1722 * If @start is in a non-trailing hole, store @start in @hole and the 1723 * beginning of the next non-hole in @data, and return 0. 1724 * If @start is in a trailing hole or beyond EOF, return -ENXIO. 1725 * If we can't find out, return a negative errno other than -ENXIO. 1726 */ 1727 static int find_allocation(BlockDriverState *bs, off_t start, 1728 off_t *data, off_t *hole) 1729 { 1730 #if defined SEEK_HOLE && defined SEEK_DATA 1731 BDRVRawState *s = bs->opaque; 1732 off_t offs; 1733 1734 /* 1735 * SEEK_DATA cases: 1736 * D1. offs == start: start is in data 1737 * D2. offs > start: start is in a hole, next data at offs 1738 * D3. offs < 0, errno = ENXIO: either start is in a trailing hole 1739 * or start is beyond EOF 1740 * If the latter happens, the file has been truncated behind 1741 * our back since we opened it. All bets are off then. 1742 * Treating like a trailing hole is simplest. 1743 * D4. offs < 0, errno != ENXIO: we learned nothing 1744 */ 1745 offs = lseek(s->fd, start, SEEK_DATA); 1746 if (offs < 0) { 1747 return -errno; /* D3 or D4 */ 1748 } 1749 assert(offs >= start); 1750 1751 if (offs > start) { 1752 /* D2: in hole, next data at offs */ 1753 *hole = start; 1754 *data = offs; 1755 return 0; 1756 } 1757 1758 /* D1: in data, end not yet known */ 1759 1760 /* 1761 * SEEK_HOLE cases: 1762 * H1. offs == start: start is in a hole 1763 * If this happens here, a hole has been dug behind our back 1764 * since the previous lseek(). 1765 * H2. offs > start: either start is in data, next hole at offs, 1766 * or start is in trailing hole, EOF at offs 1767 * Linux treats trailing holes like any other hole: offs == 1768 * start. Solaris seeks to EOF instead: offs > start (blech). 1769 * If that happens here, a hole has been dug behind our back 1770 * since the previous lseek(). 1771 * H3. offs < 0, errno = ENXIO: start is beyond EOF 1772 * If this happens, the file has been truncated behind our 1773 * back since we opened it. Treat it like a trailing hole. 1774 * H4. offs < 0, errno != ENXIO: we learned nothing 1775 * Pretend we know nothing at all, i.e. "forget" about D1. 1776 */ 1777 offs = lseek(s->fd, start, SEEK_HOLE); 1778 if (offs < 0) { 1779 return -errno; /* D1 and (H3 or H4) */ 1780 } 1781 assert(offs >= start); 1782 1783 if (offs > start) { 1784 /* 1785 * D1 and H2: either in data, next hole at offs, or it was in 1786 * data but is now in a trailing hole. In the latter case, 1787 * all bets are off. Treating it as if it there was data all 1788 * the way to EOF is safe, so simply do that. 1789 */ 1790 *data = start; 1791 *hole = offs; 1792 return 0; 1793 } 1794 1795 /* D1 and H1 */ 1796 return -EBUSY; 1797 #else 1798 return -ENOTSUP; 1799 #endif 1800 } 1801 1802 /* 1803 * Returns the allocation status of the specified sectors. 1804 * 1805 * If 'sector_num' is beyond the end of the disk image the return value is 0 1806 * and 'pnum' is set to 0. 1807 * 1808 * 'pnum' is set to the number of sectors (including and immediately following 1809 * the specified sector) that are known to be in the same 1810 * allocated/unallocated state. 1811 * 1812 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1813 * beyond the end of the disk image it will be clamped. 1814 */ 1815 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, 1816 int64_t sector_num, 1817 int nb_sectors, int *pnum, 1818 BlockDriverState **file) 1819 { 1820 off_t start, data = 0, hole = 0; 1821 int64_t total_size; 1822 int ret; 1823 1824 ret = fd_open(bs); 1825 if (ret < 0) { 1826 return ret; 1827 } 1828 1829 start = sector_num * BDRV_SECTOR_SIZE; 1830 total_size = bdrv_getlength(bs); 1831 if (total_size < 0) { 1832 return total_size; 1833 } else if (start >= total_size) { 1834 *pnum = 0; 1835 return 0; 1836 } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { 1837 nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); 1838 } 1839 1840 ret = find_allocation(bs, start, &data, &hole); 1841 if (ret == -ENXIO) { 1842 /* Trailing hole */ 1843 *pnum = nb_sectors; 1844 ret = BDRV_BLOCK_ZERO; 1845 } else if (ret < 0) { 1846 /* No info available, so pretend there are no holes */ 1847 *pnum = nb_sectors; 1848 ret = BDRV_BLOCK_DATA; 1849 } else if (data == start) { 1850 /* On a data extent, compute sectors to the end of the extent, 1851 * possibly including a partial sector at EOF. */ 1852 *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); 1853 ret = BDRV_BLOCK_DATA; 1854 } else { 1855 /* On a hole, compute sectors to the beginning of the next extent. */ 1856 assert(hole == start); 1857 *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); 1858 ret = BDRV_BLOCK_ZERO; 1859 } 1860 *file = bs; 1861 return ret | BDRV_BLOCK_OFFSET_VALID | start; 1862 } 1863 1864 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, 1865 int64_t offset, int count, 1866 BlockCompletionFunc *cb, void *opaque) 1867 { 1868 BDRVRawState *s = bs->opaque; 1869 1870 return paio_submit(bs, s->fd, offset, NULL, count, 1871 cb, opaque, QEMU_AIO_DISCARD); 1872 } 1873 1874 static int coroutine_fn raw_co_pwrite_zeroes( 1875 BlockDriverState *bs, int64_t offset, 1876 int count, BdrvRequestFlags flags) 1877 { 1878 BDRVRawState *s = bs->opaque; 1879 1880 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 1881 return paio_submit_co(bs, s->fd, offset, NULL, count, 1882 QEMU_AIO_WRITE_ZEROES); 1883 } else if (s->discard_zeroes) { 1884 return paio_submit_co(bs, s->fd, offset, NULL, count, 1885 QEMU_AIO_DISCARD); 1886 } 1887 return -ENOTSUP; 1888 } 1889 1890 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 1891 { 1892 BDRVRawState *s = bs->opaque; 1893 1894 bdi->unallocated_blocks_are_zero = s->discard_zeroes; 1895 bdi->can_write_zeroes_with_unmap = s->discard_zeroes; 1896 return 0; 1897 } 1898 1899 static QemuOptsList raw_create_opts = { 1900 .name = "raw-create-opts", 1901 .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), 1902 .desc = { 1903 { 1904 .name = BLOCK_OPT_SIZE, 1905 .type = QEMU_OPT_SIZE, 1906 .help = "Virtual disk size" 1907 }, 1908 { 1909 .name = BLOCK_OPT_NOCOW, 1910 .type = QEMU_OPT_BOOL, 1911 .help = "Turn off copy-on-write (valid only on btrfs)" 1912 }, 1913 { 1914 .name = BLOCK_OPT_PREALLOC, 1915 .type = QEMU_OPT_STRING, 1916 .help = "Preallocation mode (allowed values: off, falloc, full)" 1917 }, 1918 { /* end of list */ } 1919 } 1920 }; 1921 1922 BlockDriver bdrv_file = { 1923 .format_name = "file", 1924 .protocol_name = "file", 1925 .instance_size = sizeof(BDRVRawState), 1926 .bdrv_needs_filename = true, 1927 .bdrv_probe = NULL, /* no probe for protocols */ 1928 .bdrv_parse_filename = raw_parse_filename, 1929 .bdrv_file_open = raw_open, 1930 .bdrv_reopen_prepare = raw_reopen_prepare, 1931 .bdrv_reopen_commit = raw_reopen_commit, 1932 .bdrv_reopen_abort = raw_reopen_abort, 1933 .bdrv_close = raw_close, 1934 .bdrv_create = raw_create, 1935 .bdrv_has_zero_init = bdrv_has_zero_init_1, 1936 .bdrv_co_get_block_status = raw_co_get_block_status, 1937 .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, 1938 1939 .bdrv_co_preadv = raw_co_preadv, 1940 .bdrv_co_pwritev = raw_co_pwritev, 1941 .bdrv_aio_flush = raw_aio_flush, 1942 .bdrv_aio_pdiscard = raw_aio_pdiscard, 1943 .bdrv_refresh_limits = raw_refresh_limits, 1944 .bdrv_io_plug = raw_aio_plug, 1945 .bdrv_io_unplug = raw_aio_unplug, 1946 1947 .bdrv_truncate = raw_truncate, 1948 .bdrv_getlength = raw_getlength, 1949 .bdrv_get_info = raw_get_info, 1950 .bdrv_get_allocated_file_size 1951 = raw_get_allocated_file_size, 1952 1953 .create_opts = &raw_create_opts, 1954 }; 1955 1956 /***********************************************/ 1957 /* host device */ 1958 1959 #if defined(__APPLE__) && defined(__MACH__) 1960 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 1961 CFIndex maxPathSize, int flags); 1962 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) 1963 { 1964 kern_return_t kernResult = KERN_FAILURE; 1965 mach_port_t masterPort; 1966 CFMutableDictionaryRef classesToMatch; 1967 const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass}; 1968 char *mediaType = NULL; 1969 1970 kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); 1971 if ( KERN_SUCCESS != kernResult ) { 1972 printf( "IOMasterPort returned %d\n", kernResult ); 1973 } 1974 1975 int index; 1976 for (index = 0; index < ARRAY_SIZE(matching_array); index++) { 1977 classesToMatch = IOServiceMatching(matching_array[index]); 1978 if (classesToMatch == NULL) { 1979 error_report("IOServiceMatching returned NULL for %s", 1980 matching_array[index]); 1981 continue; 1982 } 1983 CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey), 1984 kCFBooleanTrue); 1985 kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch, 1986 mediaIterator); 1987 if (kernResult != KERN_SUCCESS) { 1988 error_report("Note: IOServiceGetMatchingServices returned %d", 1989 kernResult); 1990 continue; 1991 } 1992 1993 /* If a match was found, leave the loop */ 1994 if (*mediaIterator != 0) { 1995 DPRINTF("Matching using %s\n", matching_array[index]); 1996 mediaType = g_strdup(matching_array[index]); 1997 break; 1998 } 1999 } 2000 return mediaType; 2001 } 2002 2003 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 2004 CFIndex maxPathSize, int flags) 2005 { 2006 io_object_t nextMedia; 2007 kern_return_t kernResult = KERN_FAILURE; 2008 *bsdPath = '\0'; 2009 nextMedia = IOIteratorNext( mediaIterator ); 2010 if ( nextMedia ) 2011 { 2012 CFTypeRef bsdPathAsCFString; 2013 bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); 2014 if ( bsdPathAsCFString ) { 2015 size_t devPathLength; 2016 strcpy( bsdPath, _PATH_DEV ); 2017 if (flags & BDRV_O_NOCACHE) { 2018 strcat(bsdPath, "r"); 2019 } 2020 devPathLength = strlen( bsdPath ); 2021 if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { 2022 kernResult = KERN_SUCCESS; 2023 } 2024 CFRelease( bsdPathAsCFString ); 2025 } 2026 IOObjectRelease( nextMedia ); 2027 } 2028 2029 return kernResult; 2030 } 2031 2032 /* Sets up a real cdrom for use in QEMU */ 2033 static bool setup_cdrom(char *bsd_path, Error **errp) 2034 { 2035 int index, num_of_test_partitions = 2, fd; 2036 char test_partition[MAXPATHLEN]; 2037 bool partition_found = false; 2038 2039 /* look for a working partition */ 2040 for (index = 0; index < num_of_test_partitions; index++) { 2041 snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path, 2042 index); 2043 fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE); 2044 if (fd >= 0) { 2045 partition_found = true; 2046 qemu_close(fd); 2047 break; 2048 } 2049 } 2050 2051 /* if a working partition on the device was not found */ 2052 if (partition_found == false) { 2053 error_setg(errp, "Failed to find a working partition on disc"); 2054 } else { 2055 DPRINTF("Using %s as optical disc\n", test_partition); 2056 pstrcpy(bsd_path, MAXPATHLEN, test_partition); 2057 } 2058 return partition_found; 2059 } 2060 2061 /* Prints directions on mounting and unmounting a device */ 2062 static void print_unmounting_directions(const char *file_name) 2063 { 2064 error_report("If device %s is mounted on the desktop, unmount" 2065 " it first before using it in QEMU", file_name); 2066 error_report("Command to unmount device: diskutil unmountDisk %s", 2067 file_name); 2068 error_report("Command to mount device: diskutil mountDisk %s", file_name); 2069 } 2070 2071 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2072 2073 static int hdev_probe_device(const char *filename) 2074 { 2075 struct stat st; 2076 2077 /* allow a dedicated CD-ROM driver to match with a higher priority */ 2078 if (strstart(filename, "/dev/cdrom", NULL)) 2079 return 50; 2080 2081 if (stat(filename, &st) >= 0 && 2082 (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { 2083 return 100; 2084 } 2085 2086 return 0; 2087 } 2088 2089 static int check_hdev_writable(BDRVRawState *s) 2090 { 2091 #if defined(BLKROGET) 2092 /* Linux block devices can be configured "read-only" using blockdev(8). 2093 * This is independent of device node permissions and therefore open(2) 2094 * with O_RDWR succeeds. Actual writes fail with EPERM. 2095 * 2096 * bdrv_open() is supposed to fail if the disk is read-only. Explicitly 2097 * check for read-only block devices so that Linux block devices behave 2098 * properly. 2099 */ 2100 struct stat st; 2101 int readonly = 0; 2102 2103 if (fstat(s->fd, &st)) { 2104 return -errno; 2105 } 2106 2107 if (!S_ISBLK(st.st_mode)) { 2108 return 0; 2109 } 2110 2111 if (ioctl(s->fd, BLKROGET, &readonly) < 0) { 2112 return -errno; 2113 } 2114 2115 if (readonly) { 2116 return -EACCES; 2117 } 2118 #endif /* defined(BLKROGET) */ 2119 return 0; 2120 } 2121 2122 static void hdev_parse_filename(const char *filename, QDict *options, 2123 Error **errp) 2124 { 2125 /* The prefix is optional, just as for "file". */ 2126 strstart(filename, "host_device:", &filename); 2127 2128 qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); 2129 } 2130 2131 static bool hdev_is_sg(BlockDriverState *bs) 2132 { 2133 2134 #if defined(__linux__) 2135 2136 BDRVRawState *s = bs->opaque; 2137 struct stat st; 2138 struct sg_scsi_id scsiid; 2139 int sg_version; 2140 int ret; 2141 2142 if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) { 2143 return false; 2144 } 2145 2146 ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version); 2147 if (ret < 0) { 2148 return false; 2149 } 2150 2151 ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid); 2152 if (ret >= 0) { 2153 DPRINTF("SG device found: type=%d, version=%d\n", 2154 scsiid.scsi_type, sg_version); 2155 return true; 2156 } 2157 2158 #endif 2159 2160 return false; 2161 } 2162 2163 static int hdev_open(BlockDriverState *bs, QDict *options, int flags, 2164 Error **errp) 2165 { 2166 BDRVRawState *s = bs->opaque; 2167 Error *local_err = NULL; 2168 int ret; 2169 2170 #if defined(__APPLE__) && defined(__MACH__) 2171 const char *filename = qdict_get_str(options, "filename"); 2172 char bsd_path[MAXPATHLEN] = ""; 2173 bool error_occurred = false; 2174 2175 /* If using a real cdrom */ 2176 if (strcmp(filename, "/dev/cdrom") == 0) { 2177 char *mediaType = NULL; 2178 kern_return_t ret_val; 2179 io_iterator_t mediaIterator = 0; 2180 2181 mediaType = FindEjectableOpticalMedia(&mediaIterator); 2182 if (mediaType == NULL) { 2183 error_setg(errp, "Please make sure your CD/DVD is in the optical" 2184 " drive"); 2185 error_occurred = true; 2186 goto hdev_open_Mac_error; 2187 } 2188 2189 ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags); 2190 if (ret_val != KERN_SUCCESS) { 2191 error_setg(errp, "Could not get BSD path for optical drive"); 2192 error_occurred = true; 2193 goto hdev_open_Mac_error; 2194 } 2195 2196 /* If a real optical drive was not found */ 2197 if (bsd_path[0] == '\0') { 2198 error_setg(errp, "Failed to obtain bsd path for optical drive"); 2199 error_occurred = true; 2200 goto hdev_open_Mac_error; 2201 } 2202 2203 /* If using a cdrom disc and finding a partition on the disc failed */ 2204 if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 && 2205 setup_cdrom(bsd_path, errp) == false) { 2206 print_unmounting_directions(bsd_path); 2207 error_occurred = true; 2208 goto hdev_open_Mac_error; 2209 } 2210 2211 qdict_put(options, "filename", qstring_from_str(bsd_path)); 2212 2213 hdev_open_Mac_error: 2214 g_free(mediaType); 2215 if (mediaIterator) { 2216 IOObjectRelease(mediaIterator); 2217 } 2218 if (error_occurred) { 2219 return -ENOENT; 2220 } 2221 } 2222 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2223 2224 s->type = FTYPE_FILE; 2225 2226 ret = raw_open_common(bs, options, flags, 0, &local_err); 2227 if (ret < 0) { 2228 error_propagate(errp, local_err); 2229 #if defined(__APPLE__) && defined(__MACH__) 2230 if (*bsd_path) { 2231 filename = bsd_path; 2232 } 2233 /* if a physical device experienced an error while being opened */ 2234 if (strncmp(filename, "/dev/", 5) == 0) { 2235 print_unmounting_directions(filename); 2236 } 2237 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2238 return ret; 2239 } 2240 2241 /* Since this does ioctl the device must be already opened */ 2242 bs->sg = hdev_is_sg(bs); 2243 2244 if (flags & BDRV_O_RDWR) { 2245 ret = check_hdev_writable(s); 2246 if (ret < 0) { 2247 raw_close(bs); 2248 error_setg_errno(errp, -ret, "The device is not writable"); 2249 return ret; 2250 } 2251 } 2252 2253 return ret; 2254 } 2255 2256 #if defined(__linux__) 2257 2258 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, 2259 unsigned long int req, void *buf, 2260 BlockCompletionFunc *cb, void *opaque) 2261 { 2262 BDRVRawState *s = bs->opaque; 2263 RawPosixAIOData *acb; 2264 ThreadPool *pool; 2265 2266 if (fd_open(bs) < 0) 2267 return NULL; 2268 2269 acb = g_new(RawPosixAIOData, 1); 2270 acb->bs = bs; 2271 acb->aio_type = QEMU_AIO_IOCTL; 2272 acb->aio_fildes = s->fd; 2273 acb->aio_offset = 0; 2274 acb->aio_ioctl_buf = buf; 2275 acb->aio_ioctl_cmd = req; 2276 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 2277 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); 2278 } 2279 #endif /* linux */ 2280 2281 static int fd_open(BlockDriverState *bs) 2282 { 2283 BDRVRawState *s = bs->opaque; 2284 2285 /* this is just to ensure s->fd is sane (its called by io ops) */ 2286 if (s->fd >= 0) 2287 return 0; 2288 return -EIO; 2289 } 2290 2291 static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, 2292 int64_t offset, int count, 2293 BlockCompletionFunc *cb, void *opaque) 2294 { 2295 BDRVRawState *s = bs->opaque; 2296 2297 if (fd_open(bs) < 0) { 2298 return NULL; 2299 } 2300 return paio_submit(bs, s->fd, offset, NULL, count, 2301 cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); 2302 } 2303 2304 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, 2305 int64_t offset, int count, BdrvRequestFlags flags) 2306 { 2307 BDRVRawState *s = bs->opaque; 2308 int rc; 2309 2310 rc = fd_open(bs); 2311 if (rc < 0) { 2312 return rc; 2313 } 2314 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 2315 return paio_submit_co(bs, s->fd, offset, NULL, count, 2316 QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); 2317 } else if (s->discard_zeroes) { 2318 return paio_submit_co(bs, s->fd, offset, NULL, count, 2319 QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); 2320 } 2321 return -ENOTSUP; 2322 } 2323 2324 static int hdev_create(const char *filename, QemuOpts *opts, 2325 Error **errp) 2326 { 2327 int fd; 2328 int ret = 0; 2329 struct stat stat_buf; 2330 int64_t total_size = 0; 2331 bool has_prefix; 2332 2333 /* This function is used by both protocol block drivers and therefore either 2334 * of these prefixes may be given. 2335 * The return value has to be stored somewhere, otherwise this is an error 2336 * due to -Werror=unused-value. */ 2337 has_prefix = 2338 strstart(filename, "host_device:", &filename) || 2339 strstart(filename, "host_cdrom:" , &filename); 2340 2341 (void)has_prefix; 2342 2343 ret = raw_normalize_devicepath(&filename); 2344 if (ret < 0) { 2345 error_setg_errno(errp, -ret, "Could not normalize device path"); 2346 return ret; 2347 } 2348 2349 /* Read out options */ 2350 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 2351 BDRV_SECTOR_SIZE); 2352 2353 fd = qemu_open(filename, O_WRONLY | O_BINARY); 2354 if (fd < 0) { 2355 ret = -errno; 2356 error_setg_errno(errp, -ret, "Could not open device"); 2357 return ret; 2358 } 2359 2360 if (fstat(fd, &stat_buf) < 0) { 2361 ret = -errno; 2362 error_setg_errno(errp, -ret, "Could not stat device"); 2363 } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { 2364 error_setg(errp, 2365 "The given file is neither a block nor a character device"); 2366 ret = -ENODEV; 2367 } else if (lseek(fd, 0, SEEK_END) < total_size) { 2368 error_setg(errp, "Device is too small"); 2369 ret = -ENOSPC; 2370 } 2371 2372 qemu_close(fd); 2373 return ret; 2374 } 2375 2376 static BlockDriver bdrv_host_device = { 2377 .format_name = "host_device", 2378 .protocol_name = "host_device", 2379 .instance_size = sizeof(BDRVRawState), 2380 .bdrv_needs_filename = true, 2381 .bdrv_probe_device = hdev_probe_device, 2382 .bdrv_parse_filename = hdev_parse_filename, 2383 .bdrv_file_open = hdev_open, 2384 .bdrv_close = raw_close, 2385 .bdrv_reopen_prepare = raw_reopen_prepare, 2386 .bdrv_reopen_commit = raw_reopen_commit, 2387 .bdrv_reopen_abort = raw_reopen_abort, 2388 .bdrv_create = hdev_create, 2389 .create_opts = &raw_create_opts, 2390 .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, 2391 2392 .bdrv_co_preadv = raw_co_preadv, 2393 .bdrv_co_pwritev = raw_co_pwritev, 2394 .bdrv_aio_flush = raw_aio_flush, 2395 .bdrv_aio_pdiscard = hdev_aio_pdiscard, 2396 .bdrv_refresh_limits = raw_refresh_limits, 2397 .bdrv_io_plug = raw_aio_plug, 2398 .bdrv_io_unplug = raw_aio_unplug, 2399 2400 .bdrv_truncate = raw_truncate, 2401 .bdrv_getlength = raw_getlength, 2402 .bdrv_get_info = raw_get_info, 2403 .bdrv_get_allocated_file_size 2404 = raw_get_allocated_file_size, 2405 .bdrv_probe_blocksizes = hdev_probe_blocksizes, 2406 .bdrv_probe_geometry = hdev_probe_geometry, 2407 2408 /* generic scsi device */ 2409 #ifdef __linux__ 2410 .bdrv_aio_ioctl = hdev_aio_ioctl, 2411 #endif 2412 }; 2413 2414 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 2415 static void cdrom_parse_filename(const char *filename, QDict *options, 2416 Error **errp) 2417 { 2418 /* The prefix is optional, just as for "file". */ 2419 strstart(filename, "host_cdrom:", &filename); 2420 2421 qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); 2422 } 2423 #endif 2424 2425 #ifdef __linux__ 2426 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 2427 Error **errp) 2428 { 2429 BDRVRawState *s = bs->opaque; 2430 2431 s->type = FTYPE_CD; 2432 2433 /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ 2434 return raw_open_common(bs, options, flags, O_NONBLOCK, errp); 2435 } 2436 2437 static int cdrom_probe_device(const char *filename) 2438 { 2439 int fd, ret; 2440 int prio = 0; 2441 struct stat st; 2442 2443 fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); 2444 if (fd < 0) { 2445 goto out; 2446 } 2447 ret = fstat(fd, &st); 2448 if (ret == -1 || !S_ISBLK(st.st_mode)) { 2449 goto outc; 2450 } 2451 2452 /* Attempt to detect via a CDROM specific ioctl */ 2453 ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 2454 if (ret >= 0) 2455 prio = 100; 2456 2457 outc: 2458 qemu_close(fd); 2459 out: 2460 return prio; 2461 } 2462 2463 static bool cdrom_is_inserted(BlockDriverState *bs) 2464 { 2465 BDRVRawState *s = bs->opaque; 2466 int ret; 2467 2468 ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 2469 return ret == CDS_DISC_OK; 2470 } 2471 2472 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 2473 { 2474 BDRVRawState *s = bs->opaque; 2475 2476 if (eject_flag) { 2477 if (ioctl(s->fd, CDROMEJECT, NULL) < 0) 2478 perror("CDROMEJECT"); 2479 } else { 2480 if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) 2481 perror("CDROMEJECT"); 2482 } 2483 } 2484 2485 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 2486 { 2487 BDRVRawState *s = bs->opaque; 2488 2489 if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { 2490 /* 2491 * Note: an error can happen if the distribution automatically 2492 * mounts the CD-ROM 2493 */ 2494 /* perror("CDROM_LOCKDOOR"); */ 2495 } 2496 } 2497 2498 static BlockDriver bdrv_host_cdrom = { 2499 .format_name = "host_cdrom", 2500 .protocol_name = "host_cdrom", 2501 .instance_size = sizeof(BDRVRawState), 2502 .bdrv_needs_filename = true, 2503 .bdrv_probe_device = cdrom_probe_device, 2504 .bdrv_parse_filename = cdrom_parse_filename, 2505 .bdrv_file_open = cdrom_open, 2506 .bdrv_close = raw_close, 2507 .bdrv_reopen_prepare = raw_reopen_prepare, 2508 .bdrv_reopen_commit = raw_reopen_commit, 2509 .bdrv_reopen_abort = raw_reopen_abort, 2510 .bdrv_create = hdev_create, 2511 .create_opts = &raw_create_opts, 2512 2513 2514 .bdrv_co_preadv = raw_co_preadv, 2515 .bdrv_co_pwritev = raw_co_pwritev, 2516 .bdrv_aio_flush = raw_aio_flush, 2517 .bdrv_refresh_limits = raw_refresh_limits, 2518 .bdrv_io_plug = raw_aio_plug, 2519 .bdrv_io_unplug = raw_aio_unplug, 2520 2521 .bdrv_truncate = raw_truncate, 2522 .bdrv_getlength = raw_getlength, 2523 .has_variable_length = true, 2524 .bdrv_get_allocated_file_size 2525 = raw_get_allocated_file_size, 2526 2527 /* removable device support */ 2528 .bdrv_is_inserted = cdrom_is_inserted, 2529 .bdrv_eject = cdrom_eject, 2530 .bdrv_lock_medium = cdrom_lock_medium, 2531 2532 /* generic scsi device */ 2533 .bdrv_aio_ioctl = hdev_aio_ioctl, 2534 }; 2535 #endif /* __linux__ */ 2536 2537 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 2538 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 2539 Error **errp) 2540 { 2541 BDRVRawState *s = bs->opaque; 2542 Error *local_err = NULL; 2543 int ret; 2544 2545 s->type = FTYPE_CD; 2546 2547 ret = raw_open_common(bs, options, flags, 0, &local_err); 2548 if (ret) { 2549 error_propagate(errp, local_err); 2550 return ret; 2551 } 2552 2553 /* make sure the door isn't locked at this time */ 2554 ioctl(s->fd, CDIOCALLOW); 2555 return 0; 2556 } 2557 2558 static int cdrom_probe_device(const char *filename) 2559 { 2560 if (strstart(filename, "/dev/cd", NULL) || 2561 strstart(filename, "/dev/acd", NULL)) 2562 return 100; 2563 return 0; 2564 } 2565 2566 static int cdrom_reopen(BlockDriverState *bs) 2567 { 2568 BDRVRawState *s = bs->opaque; 2569 int fd; 2570 2571 /* 2572 * Force reread of possibly changed/newly loaded disc, 2573 * FreeBSD seems to not notice sometimes... 2574 */ 2575 if (s->fd >= 0) 2576 qemu_close(s->fd); 2577 fd = qemu_open(bs->filename, s->open_flags, 0644); 2578 if (fd < 0) { 2579 s->fd = -1; 2580 return -EIO; 2581 } 2582 s->fd = fd; 2583 2584 /* make sure the door isn't locked at this time */ 2585 ioctl(s->fd, CDIOCALLOW); 2586 return 0; 2587 } 2588 2589 static bool cdrom_is_inserted(BlockDriverState *bs) 2590 { 2591 return raw_getlength(bs) > 0; 2592 } 2593 2594 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 2595 { 2596 BDRVRawState *s = bs->opaque; 2597 2598 if (s->fd < 0) 2599 return; 2600 2601 (void) ioctl(s->fd, CDIOCALLOW); 2602 2603 if (eject_flag) { 2604 if (ioctl(s->fd, CDIOCEJECT) < 0) 2605 perror("CDIOCEJECT"); 2606 } else { 2607 if (ioctl(s->fd, CDIOCCLOSE) < 0) 2608 perror("CDIOCCLOSE"); 2609 } 2610 2611 cdrom_reopen(bs); 2612 } 2613 2614 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 2615 { 2616 BDRVRawState *s = bs->opaque; 2617 2618 if (s->fd < 0) 2619 return; 2620 if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) { 2621 /* 2622 * Note: an error can happen if the distribution automatically 2623 * mounts the CD-ROM 2624 */ 2625 /* perror("CDROM_LOCKDOOR"); */ 2626 } 2627 } 2628 2629 static BlockDriver bdrv_host_cdrom = { 2630 .format_name = "host_cdrom", 2631 .protocol_name = "host_cdrom", 2632 .instance_size = sizeof(BDRVRawState), 2633 .bdrv_needs_filename = true, 2634 .bdrv_probe_device = cdrom_probe_device, 2635 .bdrv_parse_filename = cdrom_parse_filename, 2636 .bdrv_file_open = cdrom_open, 2637 .bdrv_close = raw_close, 2638 .bdrv_reopen_prepare = raw_reopen_prepare, 2639 .bdrv_reopen_commit = raw_reopen_commit, 2640 .bdrv_reopen_abort = raw_reopen_abort, 2641 .bdrv_create = hdev_create, 2642 .create_opts = &raw_create_opts, 2643 2644 .bdrv_co_preadv = raw_co_preadv, 2645 .bdrv_co_pwritev = raw_co_pwritev, 2646 .bdrv_aio_flush = raw_aio_flush, 2647 .bdrv_refresh_limits = raw_refresh_limits, 2648 .bdrv_io_plug = raw_aio_plug, 2649 .bdrv_io_unplug = raw_aio_unplug, 2650 2651 .bdrv_truncate = raw_truncate, 2652 .bdrv_getlength = raw_getlength, 2653 .has_variable_length = true, 2654 .bdrv_get_allocated_file_size 2655 = raw_get_allocated_file_size, 2656 2657 /* removable device support */ 2658 .bdrv_is_inserted = cdrom_is_inserted, 2659 .bdrv_eject = cdrom_eject, 2660 .bdrv_lock_medium = cdrom_lock_medium, 2661 }; 2662 #endif /* __FreeBSD__ */ 2663 2664 static void bdrv_file_init(void) 2665 { 2666 /* 2667 * Register all the drivers. Note that order is important, the driver 2668 * registered last will get probed first. 2669 */ 2670 bdrv_register(&bdrv_file); 2671 bdrv_register(&bdrv_host_device); 2672 #ifdef __linux__ 2673 bdrv_register(&bdrv_host_cdrom); 2674 #endif 2675 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 2676 bdrv_register(&bdrv_host_cdrom); 2677 #endif 2678 } 2679 2680 block_init(bdrv_file_init); 2681