1 /* 2 * Block driver for RAW files (posix) 3 * 4 * Copyright (c) 2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include "qemu/osdep.h" 25 #include "qapi/error.h" 26 #include "qemu/cutils.h" 27 #include "qemu/error-report.h" 28 #include "block/block_int.h" 29 #include "qemu/module.h" 30 #include "trace.h" 31 #include "block/thread-pool.h" 32 #include "qemu/iov.h" 33 #include "block/raw-aio.h" 34 #include "qapi/qmp/qstring.h" 35 36 #include "scsi/pr-manager.h" 37 #include "scsi/constants.h" 38 39 #if defined(__APPLE__) && (__MACH__) 40 #include <paths.h> 41 #include <sys/param.h> 42 #include <IOKit/IOKitLib.h> 43 #include <IOKit/IOBSD.h> 44 #include <IOKit/storage/IOMediaBSDClient.h> 45 #include <IOKit/storage/IOMedia.h> 46 #include <IOKit/storage/IOCDMedia.h> 47 //#include <IOKit/storage/IOCDTypes.h> 48 #include <IOKit/storage/IODVDMedia.h> 49 #include <CoreFoundation/CoreFoundation.h> 50 #endif 51 52 #ifdef __sun__ 53 #define _POSIX_PTHREAD_SEMANTICS 1 54 #include <sys/dkio.h> 55 #endif 56 #ifdef __linux__ 57 #include <sys/ioctl.h> 58 #include <sys/param.h> 59 #include <linux/cdrom.h> 60 #include <linux/fd.h> 61 #include <linux/fs.h> 62 #include <linux/hdreg.h> 63 #include <scsi/sg.h> 64 #ifdef __s390__ 65 #include <asm/dasd.h> 66 #endif 67 #ifndef FS_NOCOW_FL 68 #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ 69 #endif 70 #endif 71 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) 72 #include <linux/falloc.h> 73 #endif 74 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 75 #include <sys/disk.h> 76 #include <sys/cdio.h> 77 #endif 78 79 #ifdef __OpenBSD__ 80 #include <sys/ioctl.h> 81 #include <sys/disklabel.h> 82 #include <sys/dkio.h> 83 #endif 84 85 #ifdef __NetBSD__ 86 #include <sys/ioctl.h> 87 #include <sys/disklabel.h> 88 #include <sys/dkio.h> 89 #include <sys/disk.h> 90 #endif 91 92 #ifdef __DragonFly__ 93 #include <sys/ioctl.h> 94 #include <sys/diskslice.h> 95 #endif 96 97 #ifdef CONFIG_XFS 98 #include <xfs/xfs.h> 99 #endif 100 101 //#define DEBUG_BLOCK 102 103 #ifdef DEBUG_BLOCK 104 # define DEBUG_BLOCK_PRINT 1 105 #else 106 # define DEBUG_BLOCK_PRINT 0 107 #endif 108 #define DPRINTF(fmt, ...) \ 109 do { \ 110 if (DEBUG_BLOCK_PRINT) { \ 111 printf(fmt, ## __VA_ARGS__); \ 112 } \ 113 } while (0) 114 115 /* OS X does not have O_DSYNC */ 116 #ifndef O_DSYNC 117 #ifdef O_SYNC 118 #define O_DSYNC O_SYNC 119 #elif defined(O_FSYNC) 120 #define O_DSYNC O_FSYNC 121 #endif 122 #endif 123 124 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ 125 #ifndef O_DIRECT 126 #define O_DIRECT O_DSYNC 127 #endif 128 129 #define FTYPE_FILE 0 130 #define FTYPE_CD 1 131 132 #define MAX_BLOCKSIZE 4096 133 134 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, 135 * leaving a few more bytes for its future use. */ 136 #define RAW_LOCK_PERM_BASE 100 137 #define RAW_LOCK_SHARED_BASE 200 138 139 typedef struct BDRVRawState { 140 int fd; 141 int lock_fd; 142 bool use_lock; 143 int type; 144 int open_flags; 145 size_t buf_align; 146 147 /* The current permissions. */ 148 uint64_t perm; 149 uint64_t shared_perm; 150 151 #ifdef CONFIG_XFS 152 bool is_xfs:1; 153 #endif 154 bool has_discard:1; 155 bool has_write_zeroes:1; 156 bool discard_zeroes:1; 157 bool use_linux_aio:1; 158 bool page_cache_inconsistent:1; 159 bool has_fallocate; 160 bool needs_alignment; 161 162 PRManager *pr_mgr; 163 } BDRVRawState; 164 165 typedef struct BDRVRawReopenState { 166 int fd; 167 int open_flags; 168 } BDRVRawReopenState; 169 170 static int fd_open(BlockDriverState *bs); 171 static int64_t raw_getlength(BlockDriverState *bs); 172 173 typedef struct RawPosixAIOData { 174 BlockDriverState *bs; 175 int aio_fildes; 176 union { 177 struct iovec *aio_iov; 178 void *aio_ioctl_buf; 179 }; 180 int aio_niov; 181 uint64_t aio_nbytes; 182 #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ 183 off_t aio_offset; 184 int aio_type; 185 } RawPosixAIOData; 186 187 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 188 static int cdrom_reopen(BlockDriverState *bs); 189 #endif 190 191 #if defined(__NetBSD__) 192 static int raw_normalize_devicepath(const char **filename) 193 { 194 static char namebuf[PATH_MAX]; 195 const char *dp, *fname; 196 struct stat sb; 197 198 fname = *filename; 199 dp = strrchr(fname, '/'); 200 if (lstat(fname, &sb) < 0) { 201 fprintf(stderr, "%s: stat failed: %s\n", 202 fname, strerror(errno)); 203 return -errno; 204 } 205 206 if (!S_ISBLK(sb.st_mode)) { 207 return 0; 208 } 209 210 if (dp == NULL) { 211 snprintf(namebuf, PATH_MAX, "r%s", fname); 212 } else { 213 snprintf(namebuf, PATH_MAX, "%.*s/r%s", 214 (int)(dp - fname), fname, dp + 1); 215 } 216 fprintf(stderr, "%s is a block device", fname); 217 *filename = namebuf; 218 fprintf(stderr, ", using %s\n", *filename); 219 220 return 0; 221 } 222 #else 223 static int raw_normalize_devicepath(const char **filename) 224 { 225 return 0; 226 } 227 #endif 228 229 /* 230 * Get logical block size via ioctl. On success store it in @sector_size_p. 231 */ 232 static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) 233 { 234 unsigned int sector_size; 235 bool success = false; 236 int i; 237 238 errno = ENOTSUP; 239 static const unsigned long ioctl_list[] = { 240 #ifdef BLKSSZGET 241 BLKSSZGET, 242 #endif 243 #ifdef DKIOCGETBLOCKSIZE 244 DKIOCGETBLOCKSIZE, 245 #endif 246 #ifdef DIOCGSECTORSIZE 247 DIOCGSECTORSIZE, 248 #endif 249 }; 250 251 /* Try a few ioctls to get the right size */ 252 for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) { 253 if (ioctl(fd, ioctl_list[i], §or_size) >= 0) { 254 *sector_size_p = sector_size; 255 success = true; 256 } 257 } 258 259 return success ? 0 : -errno; 260 } 261 262 /** 263 * Get physical block size of @fd. 264 * On success, store it in @blk_size and return 0. 265 * On failure, return -errno. 266 */ 267 static int probe_physical_blocksize(int fd, unsigned int *blk_size) 268 { 269 #ifdef BLKPBSZGET 270 if (ioctl(fd, BLKPBSZGET, blk_size) < 0) { 271 return -errno; 272 } 273 return 0; 274 #else 275 return -ENOTSUP; 276 #endif 277 } 278 279 /* Check if read is allowed with given memory buffer and length. 280 * 281 * This function is used to check O_DIRECT memory buffer and request alignment. 282 */ 283 static bool raw_is_io_aligned(int fd, void *buf, size_t len) 284 { 285 ssize_t ret = pread(fd, buf, len, 0); 286 287 if (ret >= 0) { 288 return true; 289 } 290 291 #ifdef __linux__ 292 /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore 293 * other errors (e.g. real I/O error), which could happen on a failed 294 * drive, since we only care about probing alignment. 295 */ 296 if (errno != EINVAL) { 297 return true; 298 } 299 #endif 300 301 return false; 302 } 303 304 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) 305 { 306 BDRVRawState *s = bs->opaque; 307 char *buf; 308 size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); 309 310 /* For SCSI generic devices the alignment is not really used. 311 With buffered I/O, we don't have any restrictions. */ 312 if (bdrv_is_sg(bs) || !s->needs_alignment) { 313 bs->bl.request_alignment = 1; 314 s->buf_align = 1; 315 return; 316 } 317 318 bs->bl.request_alignment = 0; 319 s->buf_align = 0; 320 /* Let's try to use the logical blocksize for the alignment. */ 321 if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) { 322 bs->bl.request_alignment = 0; 323 } 324 #ifdef CONFIG_XFS 325 if (s->is_xfs) { 326 struct dioattr da; 327 if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { 328 bs->bl.request_alignment = da.d_miniosz; 329 /* The kernel returns wrong information for d_mem */ 330 /* s->buf_align = da.d_mem; */ 331 } 332 } 333 #endif 334 335 /* If we could not get the sizes so far, we can only guess them */ 336 if (!s->buf_align) { 337 size_t align; 338 buf = qemu_memalign(max_align, 2 * max_align); 339 for (align = 512; align <= max_align; align <<= 1) { 340 if (raw_is_io_aligned(fd, buf + align, max_align)) { 341 s->buf_align = align; 342 break; 343 } 344 } 345 qemu_vfree(buf); 346 } 347 348 if (!bs->bl.request_alignment) { 349 size_t align; 350 buf = qemu_memalign(s->buf_align, max_align); 351 for (align = 512; align <= max_align; align <<= 1) { 352 if (raw_is_io_aligned(fd, buf, align)) { 353 bs->bl.request_alignment = align; 354 break; 355 } 356 } 357 qemu_vfree(buf); 358 } 359 360 if (!s->buf_align || !bs->bl.request_alignment) { 361 error_setg(errp, "Could not find working O_DIRECT alignment"); 362 error_append_hint(errp, "Try cache.direct=off\n"); 363 } 364 } 365 366 static void raw_parse_flags(int bdrv_flags, int *open_flags) 367 { 368 assert(open_flags != NULL); 369 370 *open_flags |= O_BINARY; 371 *open_flags &= ~O_ACCMODE; 372 if (bdrv_flags & BDRV_O_RDWR) { 373 *open_flags |= O_RDWR; 374 } else { 375 *open_flags |= O_RDONLY; 376 } 377 378 /* Use O_DSYNC for write-through caching, no flags for write-back caching, 379 * and O_DIRECT for no caching. */ 380 if ((bdrv_flags & BDRV_O_NOCACHE)) { 381 *open_flags |= O_DIRECT; 382 } 383 } 384 385 static void raw_parse_filename(const char *filename, QDict *options, 386 Error **errp) 387 { 388 bdrv_parse_filename_strip_prefix(filename, "file:", options); 389 } 390 391 static QemuOptsList raw_runtime_opts = { 392 .name = "raw", 393 .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), 394 .desc = { 395 { 396 .name = "filename", 397 .type = QEMU_OPT_STRING, 398 .help = "File name of the image", 399 }, 400 { 401 .name = "aio", 402 .type = QEMU_OPT_STRING, 403 .help = "host AIO implementation (threads, native)", 404 }, 405 { 406 .name = "locking", 407 .type = QEMU_OPT_STRING, 408 .help = "file locking mode (on/off/auto, default: auto)", 409 }, 410 { 411 .name = "pr-manager", 412 .type = QEMU_OPT_STRING, 413 .help = "id of persistent reservation manager object (default: none)", 414 }, 415 { /* end of list */ } 416 }, 417 }; 418 419 static int raw_open_common(BlockDriverState *bs, QDict *options, 420 int bdrv_flags, int open_flags, Error **errp) 421 { 422 BDRVRawState *s = bs->opaque; 423 QemuOpts *opts; 424 Error *local_err = NULL; 425 const char *filename = NULL; 426 const char *str; 427 BlockdevAioOptions aio, aio_default; 428 int fd, ret; 429 struct stat st; 430 OnOffAuto locking; 431 432 opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); 433 qemu_opts_absorb_qdict(opts, options, &local_err); 434 if (local_err) { 435 error_propagate(errp, local_err); 436 ret = -EINVAL; 437 goto fail; 438 } 439 440 filename = qemu_opt_get(opts, "filename"); 441 442 ret = raw_normalize_devicepath(&filename); 443 if (ret != 0) { 444 error_setg_errno(errp, -ret, "Could not normalize device path"); 445 goto fail; 446 } 447 448 aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) 449 ? BLOCKDEV_AIO_OPTIONS_NATIVE 450 : BLOCKDEV_AIO_OPTIONS_THREADS; 451 aio = qapi_enum_parse(&BlockdevAioOptions_lookup, 452 qemu_opt_get(opts, "aio"), 453 aio_default, &local_err); 454 if (local_err) { 455 error_propagate(errp, local_err); 456 ret = -EINVAL; 457 goto fail; 458 } 459 s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); 460 461 locking = qapi_enum_parse(&OnOffAuto_lookup, 462 qemu_opt_get(opts, "locking"), 463 ON_OFF_AUTO_AUTO, &local_err); 464 if (local_err) { 465 error_propagate(errp, local_err); 466 ret = -EINVAL; 467 goto fail; 468 } 469 switch (locking) { 470 case ON_OFF_AUTO_ON: 471 s->use_lock = true; 472 if (!qemu_has_ofd_lock()) { 473 fprintf(stderr, 474 "File lock requested but OFD locking syscall is " 475 "unavailable, falling back to POSIX file locks.\n" 476 "Due to the implementation, locks can be lost " 477 "unexpectedly.\n"); 478 } 479 break; 480 case ON_OFF_AUTO_OFF: 481 s->use_lock = false; 482 break; 483 case ON_OFF_AUTO_AUTO: 484 s->use_lock = qemu_has_ofd_lock(); 485 break; 486 default: 487 abort(); 488 } 489 490 str = qemu_opt_get(opts, "pr-manager"); 491 if (str) { 492 s->pr_mgr = pr_manager_lookup(str, &local_err); 493 if (local_err) { 494 error_propagate(errp, local_err); 495 ret = -EINVAL; 496 goto fail; 497 } 498 } 499 500 s->open_flags = open_flags; 501 raw_parse_flags(bdrv_flags, &s->open_flags); 502 503 s->fd = -1; 504 fd = qemu_open(filename, s->open_flags, 0644); 505 if (fd < 0) { 506 ret = -errno; 507 error_setg_errno(errp, errno, "Could not open '%s'", filename); 508 if (ret == -EROFS) { 509 ret = -EACCES; 510 } 511 goto fail; 512 } 513 s->fd = fd; 514 515 s->lock_fd = -1; 516 if (s->use_lock) { 517 fd = qemu_open(filename, s->open_flags); 518 if (fd < 0) { 519 ret = -errno; 520 error_setg_errno(errp, errno, "Could not open '%s' for locking", 521 filename); 522 qemu_close(s->fd); 523 goto fail; 524 } 525 s->lock_fd = fd; 526 } 527 s->perm = 0; 528 s->shared_perm = BLK_PERM_ALL; 529 530 #ifdef CONFIG_LINUX_AIO 531 /* Currently Linux does AIO only for files opened with O_DIRECT */ 532 if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) { 533 error_setg(errp, "aio=native was specified, but it requires " 534 "cache.direct=on, which was not specified."); 535 ret = -EINVAL; 536 goto fail; 537 } 538 #else 539 if (s->use_linux_aio) { 540 error_setg(errp, "aio=native was specified, but is not supported " 541 "in this build."); 542 ret = -EINVAL; 543 goto fail; 544 } 545 #endif /* !defined(CONFIG_LINUX_AIO) */ 546 547 s->has_discard = true; 548 s->has_write_zeroes = true; 549 bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; 550 if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { 551 s->needs_alignment = true; 552 } 553 554 if (fstat(s->fd, &st) < 0) { 555 ret = -errno; 556 error_setg_errno(errp, errno, "Could not stat file"); 557 goto fail; 558 } 559 if (S_ISREG(st.st_mode)) { 560 s->discard_zeroes = true; 561 s->has_fallocate = true; 562 } 563 if (S_ISBLK(st.st_mode)) { 564 #ifdef BLKDISCARDZEROES 565 unsigned int arg; 566 if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { 567 s->discard_zeroes = true; 568 } 569 #endif 570 #ifdef __linux__ 571 /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do 572 * not rely on the contents of discarded blocks unless using O_DIRECT. 573 * Same for BLKZEROOUT. 574 */ 575 if (!(bs->open_flags & BDRV_O_NOCACHE)) { 576 s->discard_zeroes = false; 577 s->has_write_zeroes = false; 578 } 579 #endif 580 } 581 #ifdef __FreeBSD__ 582 if (S_ISCHR(st.st_mode)) { 583 /* 584 * The file is a char device (disk), which on FreeBSD isn't behind 585 * a pager, so force all requests to be aligned. This is needed 586 * so QEMU makes sure all IO operations on the device are aligned 587 * to sector size, or else FreeBSD will reject them with EINVAL. 588 */ 589 s->needs_alignment = true; 590 } 591 #endif 592 593 #ifdef CONFIG_XFS 594 if (platform_test_xfs_fd(s->fd)) { 595 s->is_xfs = true; 596 } 597 #endif 598 599 ret = 0; 600 fail: 601 if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { 602 unlink(filename); 603 } 604 qemu_opts_del(opts); 605 return ret; 606 } 607 608 static int raw_open(BlockDriverState *bs, QDict *options, int flags, 609 Error **errp) 610 { 611 BDRVRawState *s = bs->opaque; 612 613 s->type = FTYPE_FILE; 614 return raw_open_common(bs, options, flags, 0, errp); 615 } 616 617 typedef enum { 618 RAW_PL_PREPARE, 619 RAW_PL_COMMIT, 620 RAW_PL_ABORT, 621 } RawPermLockOp; 622 623 #define PERM_FOREACH(i) \ 624 for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++) 625 626 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the 627 * file; if @unlock == true, also unlock the unneeded bytes. 628 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared. 629 */ 630 static int raw_apply_lock_bytes(BDRVRawState *s, 631 uint64_t perm_lock_bits, 632 uint64_t shared_perm_lock_bits, 633 bool unlock, Error **errp) 634 { 635 int ret; 636 int i; 637 638 PERM_FOREACH(i) { 639 int off = RAW_LOCK_PERM_BASE + i; 640 if (perm_lock_bits & (1ULL << i)) { 641 ret = qemu_lock_fd(s->lock_fd, off, 1, false); 642 if (ret) { 643 error_setg(errp, "Failed to lock byte %d", off); 644 return ret; 645 } 646 } else if (unlock) { 647 ret = qemu_unlock_fd(s->lock_fd, off, 1); 648 if (ret) { 649 error_setg(errp, "Failed to unlock byte %d", off); 650 return ret; 651 } 652 } 653 } 654 PERM_FOREACH(i) { 655 int off = RAW_LOCK_SHARED_BASE + i; 656 if (shared_perm_lock_bits & (1ULL << i)) { 657 ret = qemu_lock_fd(s->lock_fd, off, 1, false); 658 if (ret) { 659 error_setg(errp, "Failed to lock byte %d", off); 660 return ret; 661 } 662 } else if (unlock) { 663 ret = qemu_unlock_fd(s->lock_fd, off, 1); 664 if (ret) { 665 error_setg(errp, "Failed to unlock byte %d", off); 666 return ret; 667 } 668 } 669 } 670 return 0; 671 } 672 673 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */ 674 static int raw_check_lock_bytes(BDRVRawState *s, 675 uint64_t perm, uint64_t shared_perm, 676 Error **errp) 677 { 678 int ret; 679 int i; 680 681 PERM_FOREACH(i) { 682 int off = RAW_LOCK_SHARED_BASE + i; 683 uint64_t p = 1ULL << i; 684 if (perm & p) { 685 ret = qemu_lock_fd_test(s->lock_fd, off, 1, true); 686 if (ret) { 687 char *perm_name = bdrv_perm_names(p); 688 error_setg(errp, 689 "Failed to get \"%s\" lock", 690 perm_name); 691 g_free(perm_name); 692 error_append_hint(errp, 693 "Is another process using the image?\n"); 694 return ret; 695 } 696 } 697 } 698 PERM_FOREACH(i) { 699 int off = RAW_LOCK_PERM_BASE + i; 700 uint64_t p = 1ULL << i; 701 if (!(shared_perm & p)) { 702 ret = qemu_lock_fd_test(s->lock_fd, off, 1, true); 703 if (ret) { 704 char *perm_name = bdrv_perm_names(p); 705 error_setg(errp, 706 "Failed to get shared \"%s\" lock", 707 perm_name); 708 g_free(perm_name); 709 error_append_hint(errp, 710 "Is another process using the image?\n"); 711 return ret; 712 } 713 } 714 } 715 return 0; 716 } 717 718 static int raw_handle_perm_lock(BlockDriverState *bs, 719 RawPermLockOp op, 720 uint64_t new_perm, uint64_t new_shared, 721 Error **errp) 722 { 723 BDRVRawState *s = bs->opaque; 724 int ret = 0; 725 Error *local_err = NULL; 726 727 if (!s->use_lock) { 728 return 0; 729 } 730 731 if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) { 732 return 0; 733 } 734 735 assert(s->lock_fd > 0); 736 737 switch (op) { 738 case RAW_PL_PREPARE: 739 ret = raw_apply_lock_bytes(s, s->perm | new_perm, 740 ~s->shared_perm | ~new_shared, 741 false, errp); 742 if (!ret) { 743 ret = raw_check_lock_bytes(s, new_perm, new_shared, errp); 744 if (!ret) { 745 return 0; 746 } 747 } 748 op = RAW_PL_ABORT; 749 /* fall through to unlock bytes. */ 750 case RAW_PL_ABORT: 751 raw_apply_lock_bytes(s, s->perm, ~s->shared_perm, true, &local_err); 752 if (local_err) { 753 /* Theoretically the above call only unlocks bytes and it cannot 754 * fail. Something weird happened, report it. 755 */ 756 error_report_err(local_err); 757 } 758 break; 759 case RAW_PL_COMMIT: 760 raw_apply_lock_bytes(s, new_perm, ~new_shared, true, &local_err); 761 if (local_err) { 762 /* Theoretically the above call only unlocks bytes and it cannot 763 * fail. Something weird happened, report it. 764 */ 765 error_report_err(local_err); 766 } 767 break; 768 } 769 return ret; 770 } 771 772 static int raw_reopen_prepare(BDRVReopenState *state, 773 BlockReopenQueue *queue, Error **errp) 774 { 775 BDRVRawState *s; 776 BDRVRawReopenState *rs; 777 int ret = 0; 778 Error *local_err = NULL; 779 780 assert(state != NULL); 781 assert(state->bs != NULL); 782 783 s = state->bs->opaque; 784 785 state->opaque = g_new0(BDRVRawReopenState, 1); 786 rs = state->opaque; 787 788 if (s->type == FTYPE_CD) { 789 rs->open_flags |= O_NONBLOCK; 790 } 791 792 raw_parse_flags(state->flags, &rs->open_flags); 793 794 rs->fd = -1; 795 796 int fcntl_flags = O_APPEND | O_NONBLOCK; 797 #ifdef O_NOATIME 798 fcntl_flags |= O_NOATIME; 799 #endif 800 801 #ifdef O_ASYNC 802 /* Not all operating systems have O_ASYNC, and those that don't 803 * will not let us track the state into rs->open_flags (typically 804 * you achieve the same effect with an ioctl, for example I_SETSIG 805 * on Solaris). But we do not use O_ASYNC, so that's fine. 806 */ 807 assert((s->open_flags & O_ASYNC) == 0); 808 #endif 809 810 if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { 811 /* dup the original fd */ 812 rs->fd = qemu_dup(s->fd); 813 if (rs->fd >= 0) { 814 ret = fcntl_setfl(rs->fd, rs->open_flags); 815 if (ret) { 816 qemu_close(rs->fd); 817 rs->fd = -1; 818 } 819 } 820 } 821 822 /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ 823 if (rs->fd == -1) { 824 const char *normalized_filename = state->bs->filename; 825 ret = raw_normalize_devicepath(&normalized_filename); 826 if (ret < 0) { 827 error_setg_errno(errp, -ret, "Could not normalize device path"); 828 } else { 829 assert(!(rs->open_flags & O_CREAT)); 830 rs->fd = qemu_open(normalized_filename, rs->open_flags); 831 if (rs->fd == -1) { 832 error_setg_errno(errp, errno, "Could not reopen file"); 833 ret = -1; 834 } 835 } 836 } 837 838 /* Fail already reopen_prepare() if we can't get a working O_DIRECT 839 * alignment with the new fd. */ 840 if (rs->fd != -1) { 841 raw_probe_alignment(state->bs, rs->fd, &local_err); 842 if (local_err) { 843 qemu_close(rs->fd); 844 rs->fd = -1; 845 error_propagate(errp, local_err); 846 ret = -EINVAL; 847 } 848 } 849 850 return ret; 851 } 852 853 static void raw_reopen_commit(BDRVReopenState *state) 854 { 855 BDRVRawReopenState *rs = state->opaque; 856 BDRVRawState *s = state->bs->opaque; 857 858 s->open_flags = rs->open_flags; 859 860 qemu_close(s->fd); 861 s->fd = rs->fd; 862 863 g_free(state->opaque); 864 state->opaque = NULL; 865 } 866 867 868 static void raw_reopen_abort(BDRVReopenState *state) 869 { 870 BDRVRawReopenState *rs = state->opaque; 871 872 /* nothing to do if NULL, we didn't get far enough */ 873 if (rs == NULL) { 874 return; 875 } 876 877 if (rs->fd >= 0) { 878 qemu_close(rs->fd); 879 rs->fd = -1; 880 } 881 g_free(state->opaque); 882 state->opaque = NULL; 883 } 884 885 static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd) 886 { 887 #ifdef BLKSECTGET 888 int max_bytes = 0; 889 short max_sectors = 0; 890 if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) { 891 return max_bytes; 892 } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) { 893 return max_sectors << BDRV_SECTOR_BITS; 894 } else { 895 return -errno; 896 } 897 #else 898 return -ENOSYS; 899 #endif 900 } 901 902 static int hdev_get_max_segments(const struct stat *st) 903 { 904 #ifdef CONFIG_LINUX 905 char buf[32]; 906 const char *end; 907 char *sysfspath; 908 int ret; 909 int fd = -1; 910 long max_segments; 911 912 sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments", 913 major(st->st_rdev), minor(st->st_rdev)); 914 fd = open(sysfspath, O_RDONLY); 915 if (fd == -1) { 916 ret = -errno; 917 goto out; 918 } 919 do { 920 ret = read(fd, buf, sizeof(buf) - 1); 921 } while (ret == -1 && errno == EINTR); 922 if (ret < 0) { 923 ret = -errno; 924 goto out; 925 } else if (ret == 0) { 926 ret = -EIO; 927 goto out; 928 } 929 buf[ret] = 0; 930 /* The file is ended with '\n', pass 'end' to accept that. */ 931 ret = qemu_strtol(buf, &end, 10, &max_segments); 932 if (ret == 0 && end && *end == '\n') { 933 ret = max_segments; 934 } 935 936 out: 937 if (fd != -1) { 938 close(fd); 939 } 940 g_free(sysfspath); 941 return ret; 942 #else 943 return -ENOTSUP; 944 #endif 945 } 946 947 static void raw_refresh_limits(BlockDriverState *bs, Error **errp) 948 { 949 BDRVRawState *s = bs->opaque; 950 struct stat st; 951 952 if (!fstat(s->fd, &st)) { 953 if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { 954 int ret = hdev_get_max_transfer_length(bs, s->fd); 955 if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { 956 bs->bl.max_transfer = pow2floor(ret); 957 } 958 ret = hdev_get_max_segments(&st); 959 if (ret > 0) { 960 bs->bl.max_transfer = MIN(bs->bl.max_transfer, 961 ret * getpagesize()); 962 } 963 } 964 } 965 966 raw_probe_alignment(bs, s->fd, errp); 967 bs->bl.min_mem_alignment = s->buf_align; 968 bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); 969 } 970 971 static int check_for_dasd(int fd) 972 { 973 #ifdef BIODASDINFO2 974 struct dasd_information2_t info = {0}; 975 976 return ioctl(fd, BIODASDINFO2, &info); 977 #else 978 return -1; 979 #endif 980 } 981 982 /** 983 * Try to get @bs's logical and physical block size. 984 * On success, store them in @bsz and return zero. 985 * On failure, return negative errno. 986 */ 987 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) 988 { 989 BDRVRawState *s = bs->opaque; 990 int ret; 991 992 /* If DASD, get blocksizes */ 993 if (check_for_dasd(s->fd) < 0) { 994 return -ENOTSUP; 995 } 996 ret = probe_logical_blocksize(s->fd, &bsz->log); 997 if (ret < 0) { 998 return ret; 999 } 1000 return probe_physical_blocksize(s->fd, &bsz->phys); 1001 } 1002 1003 /** 1004 * Try to get @bs's geometry: cyls, heads, sectors. 1005 * On success, store them in @geo and return 0. 1006 * On failure return -errno. 1007 * (Allows block driver to assign default geometry values that guest sees) 1008 */ 1009 #ifdef __linux__ 1010 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 1011 { 1012 BDRVRawState *s = bs->opaque; 1013 struct hd_geometry ioctl_geo = {0}; 1014 1015 /* If DASD, get its geometry */ 1016 if (check_for_dasd(s->fd) < 0) { 1017 return -ENOTSUP; 1018 } 1019 if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { 1020 return -errno; 1021 } 1022 /* HDIO_GETGEO may return success even though geo contains zeros 1023 (e.g. certain multipath setups) */ 1024 if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { 1025 return -ENOTSUP; 1026 } 1027 /* Do not return a geometry for partition */ 1028 if (ioctl_geo.start != 0) { 1029 return -ENOTSUP; 1030 } 1031 geo->heads = ioctl_geo.heads; 1032 geo->sectors = ioctl_geo.sectors; 1033 geo->cylinders = ioctl_geo.cylinders; 1034 1035 return 0; 1036 } 1037 #else /* __linux__ */ 1038 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 1039 { 1040 return -ENOTSUP; 1041 } 1042 #endif 1043 1044 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) 1045 { 1046 int ret; 1047 1048 ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); 1049 if (ret == -1) { 1050 return -errno; 1051 } 1052 1053 return 0; 1054 } 1055 1056 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) 1057 { 1058 BDRVRawState *s = aiocb->bs->opaque; 1059 int ret; 1060 1061 if (s->page_cache_inconsistent) { 1062 return -EIO; 1063 } 1064 1065 ret = qemu_fdatasync(aiocb->aio_fildes); 1066 if (ret == -1) { 1067 /* There is no clear definition of the semantics of a failing fsync(), 1068 * so we may have to assume the worst. The sad truth is that this 1069 * assumption is correct for Linux. Some pages are now probably marked 1070 * clean in the page cache even though they are inconsistent with the 1071 * on-disk contents. The next fdatasync() call would succeed, but no 1072 * further writeback attempt will be made. We can't get back to a state 1073 * in which we know what is on disk (we would have to rewrite 1074 * everything that was touched since the last fdatasync() at least), so 1075 * make bdrv_flush() fail permanently. Given that the behaviour isn't 1076 * really defined, I have little hope that other OSes are doing better. 1077 * 1078 * Obviously, this doesn't affect O_DIRECT, which bypasses the page 1079 * cache. */ 1080 if ((s->open_flags & O_DIRECT) == 0) { 1081 s->page_cache_inconsistent = true; 1082 } 1083 return -errno; 1084 } 1085 return 0; 1086 } 1087 1088 #ifdef CONFIG_PREADV 1089 1090 static bool preadv_present = true; 1091 1092 static ssize_t 1093 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1094 { 1095 return preadv(fd, iov, nr_iov, offset); 1096 } 1097 1098 static ssize_t 1099 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1100 { 1101 return pwritev(fd, iov, nr_iov, offset); 1102 } 1103 1104 #else 1105 1106 static bool preadv_present = false; 1107 1108 static ssize_t 1109 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1110 { 1111 return -ENOSYS; 1112 } 1113 1114 static ssize_t 1115 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1116 { 1117 return -ENOSYS; 1118 } 1119 1120 #endif 1121 1122 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) 1123 { 1124 ssize_t len; 1125 1126 do { 1127 if (aiocb->aio_type & QEMU_AIO_WRITE) 1128 len = qemu_pwritev(aiocb->aio_fildes, 1129 aiocb->aio_iov, 1130 aiocb->aio_niov, 1131 aiocb->aio_offset); 1132 else 1133 len = qemu_preadv(aiocb->aio_fildes, 1134 aiocb->aio_iov, 1135 aiocb->aio_niov, 1136 aiocb->aio_offset); 1137 } while (len == -1 && errno == EINTR); 1138 1139 if (len == -1) { 1140 return -errno; 1141 } 1142 return len; 1143 } 1144 1145 /* 1146 * Read/writes the data to/from a given linear buffer. 1147 * 1148 * Returns the number of bytes handles or -errno in case of an error. Short 1149 * reads are only returned if the end of the file is reached. 1150 */ 1151 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) 1152 { 1153 ssize_t offset = 0; 1154 ssize_t len; 1155 1156 while (offset < aiocb->aio_nbytes) { 1157 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1158 len = pwrite(aiocb->aio_fildes, 1159 (const char *)buf + offset, 1160 aiocb->aio_nbytes - offset, 1161 aiocb->aio_offset + offset); 1162 } else { 1163 len = pread(aiocb->aio_fildes, 1164 buf + offset, 1165 aiocb->aio_nbytes - offset, 1166 aiocb->aio_offset + offset); 1167 } 1168 if (len == -1 && errno == EINTR) { 1169 continue; 1170 } else if (len == -1 && errno == EINVAL && 1171 (aiocb->bs->open_flags & BDRV_O_NOCACHE) && 1172 !(aiocb->aio_type & QEMU_AIO_WRITE) && 1173 offset > 0) { 1174 /* O_DIRECT pread() may fail with EINVAL when offset is unaligned 1175 * after a short read. Assume that O_DIRECT short reads only occur 1176 * at EOF. Therefore this is a short read, not an I/O error. 1177 */ 1178 break; 1179 } else if (len == -1) { 1180 offset = -errno; 1181 break; 1182 } else if (len == 0) { 1183 break; 1184 } 1185 offset += len; 1186 } 1187 1188 return offset; 1189 } 1190 1191 static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) 1192 { 1193 ssize_t nbytes; 1194 char *buf; 1195 1196 if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { 1197 /* 1198 * If there is just a single buffer, and it is properly aligned 1199 * we can just use plain pread/pwrite without any problems. 1200 */ 1201 if (aiocb->aio_niov == 1) { 1202 return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); 1203 } 1204 /* 1205 * We have more than one iovec, and all are properly aligned. 1206 * 1207 * Try preadv/pwritev first and fall back to linearizing the 1208 * buffer if it's not supported. 1209 */ 1210 if (preadv_present) { 1211 nbytes = handle_aiocb_rw_vector(aiocb); 1212 if (nbytes == aiocb->aio_nbytes || 1213 (nbytes < 0 && nbytes != -ENOSYS)) { 1214 return nbytes; 1215 } 1216 preadv_present = false; 1217 } 1218 1219 /* 1220 * XXX(hch): short read/write. no easy way to handle the reminder 1221 * using these interfaces. For now retry using plain 1222 * pread/pwrite? 1223 */ 1224 } 1225 1226 /* 1227 * Ok, we have to do it the hard way, copy all segments into 1228 * a single aligned buffer. 1229 */ 1230 buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); 1231 if (buf == NULL) { 1232 return -ENOMEM; 1233 } 1234 1235 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1236 char *p = buf; 1237 int i; 1238 1239 for (i = 0; i < aiocb->aio_niov; ++i) { 1240 memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); 1241 p += aiocb->aio_iov[i].iov_len; 1242 } 1243 assert(p - buf == aiocb->aio_nbytes); 1244 } 1245 1246 nbytes = handle_aiocb_rw_linear(aiocb, buf); 1247 if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { 1248 char *p = buf; 1249 size_t count = aiocb->aio_nbytes, copy; 1250 int i; 1251 1252 for (i = 0; i < aiocb->aio_niov && count; ++i) { 1253 copy = count; 1254 if (copy > aiocb->aio_iov[i].iov_len) { 1255 copy = aiocb->aio_iov[i].iov_len; 1256 } 1257 memcpy(aiocb->aio_iov[i].iov_base, p, copy); 1258 assert(count >= copy); 1259 p += copy; 1260 count -= copy; 1261 } 1262 assert(count == 0); 1263 } 1264 qemu_vfree(buf); 1265 1266 return nbytes; 1267 } 1268 1269 #ifdef CONFIG_XFS 1270 static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) 1271 { 1272 struct xfs_flock64 fl; 1273 int err; 1274 1275 memset(&fl, 0, sizeof(fl)); 1276 fl.l_whence = SEEK_SET; 1277 fl.l_start = offset; 1278 fl.l_len = bytes; 1279 1280 if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { 1281 err = errno; 1282 DPRINTF("cannot write zero range (%s)\n", strerror(errno)); 1283 return -err; 1284 } 1285 1286 return 0; 1287 } 1288 1289 static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) 1290 { 1291 struct xfs_flock64 fl; 1292 int err; 1293 1294 memset(&fl, 0, sizeof(fl)); 1295 fl.l_whence = SEEK_SET; 1296 fl.l_start = offset; 1297 fl.l_len = bytes; 1298 1299 if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { 1300 err = errno; 1301 DPRINTF("cannot punch hole (%s)\n", strerror(errno)); 1302 return -err; 1303 } 1304 1305 return 0; 1306 } 1307 #endif 1308 1309 static int translate_err(int err) 1310 { 1311 if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || 1312 err == -ENOTTY) { 1313 err = -ENOTSUP; 1314 } 1315 return err; 1316 } 1317 1318 #ifdef CONFIG_FALLOCATE 1319 static int do_fallocate(int fd, int mode, off_t offset, off_t len) 1320 { 1321 do { 1322 if (fallocate(fd, mode, offset, len) == 0) { 1323 return 0; 1324 } 1325 } while (errno == EINTR); 1326 return translate_err(-errno); 1327 } 1328 #endif 1329 1330 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) 1331 { 1332 int ret = -ENOTSUP; 1333 BDRVRawState *s = aiocb->bs->opaque; 1334 1335 if (!s->has_write_zeroes) { 1336 return -ENOTSUP; 1337 } 1338 1339 #ifdef BLKZEROOUT 1340 do { 1341 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1342 if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { 1343 return 0; 1344 } 1345 } while (errno == EINTR); 1346 1347 ret = translate_err(-errno); 1348 #endif 1349 1350 if (ret == -ENOTSUP) { 1351 s->has_write_zeroes = false; 1352 } 1353 return ret; 1354 } 1355 1356 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) 1357 { 1358 #if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) 1359 BDRVRawState *s = aiocb->bs->opaque; 1360 #endif 1361 #ifdef CONFIG_FALLOCATE 1362 int64_t len; 1363 #endif 1364 1365 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1366 return handle_aiocb_write_zeroes_block(aiocb); 1367 } 1368 1369 #ifdef CONFIG_XFS 1370 if (s->is_xfs) { 1371 return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); 1372 } 1373 #endif 1374 1375 #ifdef CONFIG_FALLOCATE_ZERO_RANGE 1376 if (s->has_write_zeroes) { 1377 int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, 1378 aiocb->aio_offset, aiocb->aio_nbytes); 1379 if (ret == 0 || ret != -ENOTSUP) { 1380 return ret; 1381 } 1382 s->has_write_zeroes = false; 1383 } 1384 #endif 1385 1386 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1387 if (s->has_discard && s->has_fallocate) { 1388 int ret = do_fallocate(s->fd, 1389 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1390 aiocb->aio_offset, aiocb->aio_nbytes); 1391 if (ret == 0) { 1392 ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1393 if (ret == 0 || ret != -ENOTSUP) { 1394 return ret; 1395 } 1396 s->has_fallocate = false; 1397 } else if (ret != -ENOTSUP) { 1398 return ret; 1399 } else { 1400 s->has_discard = false; 1401 } 1402 } 1403 #endif 1404 1405 #ifdef CONFIG_FALLOCATE 1406 /* Last resort: we are trying to extend the file with zeroed data. This 1407 * can be done via fallocate(fd, 0) */ 1408 len = bdrv_getlength(aiocb->bs); 1409 if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) { 1410 int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1411 if (ret == 0 || ret != -ENOTSUP) { 1412 return ret; 1413 } 1414 s->has_fallocate = false; 1415 } 1416 #endif 1417 1418 return -ENOTSUP; 1419 } 1420 1421 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) 1422 { 1423 int ret = -EOPNOTSUPP; 1424 BDRVRawState *s = aiocb->bs->opaque; 1425 1426 if (!s->has_discard) { 1427 return -ENOTSUP; 1428 } 1429 1430 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1431 #ifdef BLKDISCARD 1432 do { 1433 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1434 if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { 1435 return 0; 1436 } 1437 } while (errno == EINTR); 1438 1439 ret = -errno; 1440 #endif 1441 } else { 1442 #ifdef CONFIG_XFS 1443 if (s->is_xfs) { 1444 return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); 1445 } 1446 #endif 1447 1448 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1449 ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1450 aiocb->aio_offset, aiocb->aio_nbytes); 1451 #endif 1452 } 1453 1454 ret = translate_err(ret); 1455 if (ret == -ENOTSUP) { 1456 s->has_discard = false; 1457 } 1458 return ret; 1459 } 1460 1461 static int aio_worker(void *arg) 1462 { 1463 RawPosixAIOData *aiocb = arg; 1464 ssize_t ret = 0; 1465 1466 switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { 1467 case QEMU_AIO_READ: 1468 ret = handle_aiocb_rw(aiocb); 1469 if (ret >= 0 && ret < aiocb->aio_nbytes) { 1470 iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, 1471 0, aiocb->aio_nbytes - ret); 1472 1473 ret = aiocb->aio_nbytes; 1474 } 1475 if (ret == aiocb->aio_nbytes) { 1476 ret = 0; 1477 } else if (ret >= 0 && ret < aiocb->aio_nbytes) { 1478 ret = -EINVAL; 1479 } 1480 break; 1481 case QEMU_AIO_WRITE: 1482 ret = handle_aiocb_rw(aiocb); 1483 if (ret == aiocb->aio_nbytes) { 1484 ret = 0; 1485 } else if (ret >= 0 && ret < aiocb->aio_nbytes) { 1486 ret = -EINVAL; 1487 } 1488 break; 1489 case QEMU_AIO_FLUSH: 1490 ret = handle_aiocb_flush(aiocb); 1491 break; 1492 case QEMU_AIO_IOCTL: 1493 ret = handle_aiocb_ioctl(aiocb); 1494 break; 1495 case QEMU_AIO_DISCARD: 1496 ret = handle_aiocb_discard(aiocb); 1497 break; 1498 case QEMU_AIO_WRITE_ZEROES: 1499 ret = handle_aiocb_write_zeroes(aiocb); 1500 break; 1501 default: 1502 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); 1503 ret = -EINVAL; 1504 break; 1505 } 1506 1507 g_free(aiocb); 1508 return ret; 1509 } 1510 1511 static int paio_submit_co(BlockDriverState *bs, int fd, 1512 int64_t offset, QEMUIOVector *qiov, 1513 int bytes, int type) 1514 { 1515 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); 1516 ThreadPool *pool; 1517 1518 acb->bs = bs; 1519 acb->aio_type = type; 1520 acb->aio_fildes = fd; 1521 1522 acb->aio_nbytes = bytes; 1523 acb->aio_offset = offset; 1524 1525 if (qiov) { 1526 acb->aio_iov = qiov->iov; 1527 acb->aio_niov = qiov->niov; 1528 assert(qiov->size == bytes); 1529 } 1530 1531 trace_paio_submit_co(offset, bytes, type); 1532 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 1533 return thread_pool_submit_co(pool, aio_worker, acb); 1534 } 1535 1536 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, 1537 int64_t offset, QEMUIOVector *qiov, int bytes, 1538 BlockCompletionFunc *cb, void *opaque, int type) 1539 { 1540 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); 1541 ThreadPool *pool; 1542 1543 acb->bs = bs; 1544 acb->aio_type = type; 1545 acb->aio_fildes = fd; 1546 1547 acb->aio_nbytes = bytes; 1548 acb->aio_offset = offset; 1549 1550 if (qiov) { 1551 acb->aio_iov = qiov->iov; 1552 acb->aio_niov = qiov->niov; 1553 assert(qiov->size == acb->aio_nbytes); 1554 } 1555 1556 trace_paio_submit(acb, opaque, offset, bytes, type); 1557 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 1558 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); 1559 } 1560 1561 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, 1562 uint64_t bytes, QEMUIOVector *qiov, int type) 1563 { 1564 BDRVRawState *s = bs->opaque; 1565 1566 if (fd_open(bs) < 0) 1567 return -EIO; 1568 1569 /* 1570 * Check if the underlying device requires requests to be aligned, 1571 * and if the request we are trying to submit is aligned or not. 1572 * If this is the case tell the low-level driver that it needs 1573 * to copy the buffer. 1574 */ 1575 if (s->needs_alignment) { 1576 if (!bdrv_qiov_is_aligned(bs, qiov)) { 1577 type |= QEMU_AIO_MISALIGNED; 1578 #ifdef CONFIG_LINUX_AIO 1579 } else if (s->use_linux_aio) { 1580 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1581 assert(qiov->size == bytes); 1582 return laio_co_submit(bs, aio, s->fd, offset, qiov, type); 1583 #endif 1584 } 1585 } 1586 1587 return paio_submit_co(bs, s->fd, offset, qiov, bytes, type); 1588 } 1589 1590 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, 1591 uint64_t bytes, QEMUIOVector *qiov, 1592 int flags) 1593 { 1594 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ); 1595 } 1596 1597 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, 1598 uint64_t bytes, QEMUIOVector *qiov, 1599 int flags) 1600 { 1601 assert(flags == 0); 1602 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); 1603 } 1604 1605 static void raw_aio_plug(BlockDriverState *bs) 1606 { 1607 #ifdef CONFIG_LINUX_AIO 1608 BDRVRawState *s = bs->opaque; 1609 if (s->use_linux_aio) { 1610 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1611 laio_io_plug(bs, aio); 1612 } 1613 #endif 1614 } 1615 1616 static void raw_aio_unplug(BlockDriverState *bs) 1617 { 1618 #ifdef CONFIG_LINUX_AIO 1619 BDRVRawState *s = bs->opaque; 1620 if (s->use_linux_aio) { 1621 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1622 laio_io_unplug(bs, aio); 1623 } 1624 #endif 1625 } 1626 1627 static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, 1628 BlockCompletionFunc *cb, void *opaque) 1629 { 1630 BDRVRawState *s = bs->opaque; 1631 1632 if (fd_open(bs) < 0) 1633 return NULL; 1634 1635 return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); 1636 } 1637 1638 static void raw_close(BlockDriverState *bs) 1639 { 1640 BDRVRawState *s = bs->opaque; 1641 1642 if (s->fd >= 0) { 1643 qemu_close(s->fd); 1644 s->fd = -1; 1645 } 1646 if (s->lock_fd >= 0) { 1647 qemu_close(s->lock_fd); 1648 s->lock_fd = -1; 1649 } 1650 } 1651 1652 /** 1653 * Truncates the given regular file @fd to @offset and, when growing, fills the 1654 * new space according to @prealloc. 1655 * 1656 * Returns: 0 on success, -errno on failure. 1657 */ 1658 static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc, 1659 Error **errp) 1660 { 1661 int result = 0; 1662 int64_t current_length = 0; 1663 char *buf = NULL; 1664 struct stat st; 1665 1666 if (fstat(fd, &st) < 0) { 1667 result = -errno; 1668 error_setg_errno(errp, -result, "Could not stat file"); 1669 return result; 1670 } 1671 1672 current_length = st.st_size; 1673 if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { 1674 error_setg(errp, "Cannot use preallocation for shrinking files"); 1675 return -ENOTSUP; 1676 } 1677 1678 switch (prealloc) { 1679 #ifdef CONFIG_POSIX_FALLOCATE 1680 case PREALLOC_MODE_FALLOC: 1681 /* 1682 * Truncating before posix_fallocate() makes it about twice slower on 1683 * file systems that do not support fallocate(), trying to check if a 1684 * block is allocated before allocating it, so don't do that here. 1685 */ 1686 result = -posix_fallocate(fd, current_length, offset - current_length); 1687 if (result != 0) { 1688 /* posix_fallocate() doesn't set errno. */ 1689 error_setg_errno(errp, -result, 1690 "Could not preallocate new data"); 1691 } 1692 goto out; 1693 #endif 1694 case PREALLOC_MODE_FULL: 1695 { 1696 int64_t num = 0, left = offset - current_length; 1697 1698 /* 1699 * Knowing the final size from the beginning could allow the file 1700 * system driver to do less allocations and possibly avoid 1701 * fragmentation of the file. 1702 */ 1703 if (ftruncate(fd, offset) != 0) { 1704 result = -errno; 1705 error_setg_errno(errp, -result, "Could not resize file"); 1706 goto out; 1707 } 1708 1709 buf = g_malloc0(65536); 1710 1711 result = lseek(fd, current_length, SEEK_SET); 1712 if (result < 0) { 1713 result = -errno; 1714 error_setg_errno(errp, -result, 1715 "Failed to seek to the old end of file"); 1716 goto out; 1717 } 1718 1719 while (left > 0) { 1720 num = MIN(left, 65536); 1721 result = write(fd, buf, num); 1722 if (result < 0) { 1723 result = -errno; 1724 error_setg_errno(errp, -result, 1725 "Could not write zeros for preallocation"); 1726 goto out; 1727 } 1728 left -= result; 1729 } 1730 if (result >= 0) { 1731 result = fsync(fd); 1732 if (result < 0) { 1733 result = -errno; 1734 error_setg_errno(errp, -result, 1735 "Could not flush file to disk"); 1736 goto out; 1737 } 1738 } 1739 goto out; 1740 } 1741 case PREALLOC_MODE_OFF: 1742 if (ftruncate(fd, offset) != 0) { 1743 result = -errno; 1744 error_setg_errno(errp, -result, "Could not resize file"); 1745 } 1746 return result; 1747 default: 1748 result = -ENOTSUP; 1749 error_setg(errp, "Unsupported preallocation mode: %s", 1750 PreallocMode_str(prealloc)); 1751 return result; 1752 } 1753 1754 out: 1755 if (result < 0) { 1756 if (ftruncate(fd, current_length) < 0) { 1757 error_report("Failed to restore old file length: %s", 1758 strerror(errno)); 1759 } 1760 } 1761 1762 g_free(buf); 1763 return result; 1764 } 1765 1766 static int raw_truncate(BlockDriverState *bs, int64_t offset, 1767 PreallocMode prealloc, Error **errp) 1768 { 1769 BDRVRawState *s = bs->opaque; 1770 struct stat st; 1771 int ret; 1772 1773 if (fstat(s->fd, &st)) { 1774 ret = -errno; 1775 error_setg_errno(errp, -ret, "Failed to fstat() the file"); 1776 return ret; 1777 } 1778 1779 if (S_ISREG(st.st_mode)) { 1780 return raw_regular_truncate(s->fd, offset, prealloc, errp); 1781 } 1782 1783 if (prealloc != PREALLOC_MODE_OFF) { 1784 error_setg(errp, "Preallocation mode '%s' unsupported for this " 1785 "non-regular file", PreallocMode_str(prealloc)); 1786 return -ENOTSUP; 1787 } 1788 1789 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1790 if (offset > raw_getlength(bs)) { 1791 error_setg(errp, "Cannot grow device files"); 1792 return -EINVAL; 1793 } 1794 } else { 1795 error_setg(errp, "Resizing this file is not supported"); 1796 return -ENOTSUP; 1797 } 1798 1799 return 0; 1800 } 1801 1802 #ifdef __OpenBSD__ 1803 static int64_t raw_getlength(BlockDriverState *bs) 1804 { 1805 BDRVRawState *s = bs->opaque; 1806 int fd = s->fd; 1807 struct stat st; 1808 1809 if (fstat(fd, &st)) 1810 return -errno; 1811 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1812 struct disklabel dl; 1813 1814 if (ioctl(fd, DIOCGDINFO, &dl)) 1815 return -errno; 1816 return (uint64_t)dl.d_secsize * 1817 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 1818 } else 1819 return st.st_size; 1820 } 1821 #elif defined(__NetBSD__) 1822 static int64_t raw_getlength(BlockDriverState *bs) 1823 { 1824 BDRVRawState *s = bs->opaque; 1825 int fd = s->fd; 1826 struct stat st; 1827 1828 if (fstat(fd, &st)) 1829 return -errno; 1830 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1831 struct dkwedge_info dkw; 1832 1833 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) { 1834 return dkw.dkw_size * 512; 1835 } else { 1836 struct disklabel dl; 1837 1838 if (ioctl(fd, DIOCGDINFO, &dl)) 1839 return -errno; 1840 return (uint64_t)dl.d_secsize * 1841 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 1842 } 1843 } else 1844 return st.st_size; 1845 } 1846 #elif defined(__sun__) 1847 static int64_t raw_getlength(BlockDriverState *bs) 1848 { 1849 BDRVRawState *s = bs->opaque; 1850 struct dk_minfo minfo; 1851 int ret; 1852 int64_t size; 1853 1854 ret = fd_open(bs); 1855 if (ret < 0) { 1856 return ret; 1857 } 1858 1859 /* 1860 * Use the DKIOCGMEDIAINFO ioctl to read the size. 1861 */ 1862 ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo); 1863 if (ret != -1) { 1864 return minfo.dki_lbsize * minfo.dki_capacity; 1865 } 1866 1867 /* 1868 * There are reports that lseek on some devices fails, but 1869 * irc discussion said that contingency on contingency was overkill. 1870 */ 1871 size = lseek(s->fd, 0, SEEK_END); 1872 if (size < 0) { 1873 return -errno; 1874 } 1875 return size; 1876 } 1877 #elif defined(CONFIG_BSD) 1878 static int64_t raw_getlength(BlockDriverState *bs) 1879 { 1880 BDRVRawState *s = bs->opaque; 1881 int fd = s->fd; 1882 int64_t size; 1883 struct stat sb; 1884 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 1885 int reopened = 0; 1886 #endif 1887 int ret; 1888 1889 ret = fd_open(bs); 1890 if (ret < 0) 1891 return ret; 1892 1893 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 1894 again: 1895 #endif 1896 if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { 1897 #ifdef DIOCGMEDIASIZE 1898 if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) 1899 #elif defined(DIOCGPART) 1900 { 1901 struct partinfo pi; 1902 if (ioctl(fd, DIOCGPART, &pi) == 0) 1903 size = pi.media_size; 1904 else 1905 size = 0; 1906 } 1907 if (size == 0) 1908 #endif 1909 #if defined(__APPLE__) && defined(__MACH__) 1910 { 1911 uint64_t sectors = 0; 1912 uint32_t sector_size = 0; 1913 1914 if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 1915 && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { 1916 size = sectors * sector_size; 1917 } else { 1918 size = lseek(fd, 0LL, SEEK_END); 1919 if (size < 0) { 1920 return -errno; 1921 } 1922 } 1923 } 1924 #else 1925 size = lseek(fd, 0LL, SEEK_END); 1926 if (size < 0) { 1927 return -errno; 1928 } 1929 #endif 1930 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 1931 switch(s->type) { 1932 case FTYPE_CD: 1933 /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ 1934 if (size == 2048LL * (unsigned)-1) 1935 size = 0; 1936 /* XXX no disc? maybe we need to reopen... */ 1937 if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) { 1938 reopened = 1; 1939 goto again; 1940 } 1941 } 1942 #endif 1943 } else { 1944 size = lseek(fd, 0, SEEK_END); 1945 if (size < 0) { 1946 return -errno; 1947 } 1948 } 1949 return size; 1950 } 1951 #else 1952 static int64_t raw_getlength(BlockDriverState *bs) 1953 { 1954 BDRVRawState *s = bs->opaque; 1955 int ret; 1956 int64_t size; 1957 1958 ret = fd_open(bs); 1959 if (ret < 0) { 1960 return ret; 1961 } 1962 1963 size = lseek(s->fd, 0, SEEK_END); 1964 if (size < 0) { 1965 return -errno; 1966 } 1967 return size; 1968 } 1969 #endif 1970 1971 static int64_t raw_get_allocated_file_size(BlockDriverState *bs) 1972 { 1973 struct stat st; 1974 BDRVRawState *s = bs->opaque; 1975 1976 if (fstat(s->fd, &st) < 0) { 1977 return -errno; 1978 } 1979 return (int64_t)st.st_blocks * 512; 1980 } 1981 1982 static int raw_create(const char *filename, QemuOpts *opts, Error **errp) 1983 { 1984 int fd; 1985 int result = 0; 1986 int64_t total_size = 0; 1987 bool nocow = false; 1988 PreallocMode prealloc; 1989 char *buf = NULL; 1990 Error *local_err = NULL; 1991 1992 strstart(filename, "file:", &filename); 1993 1994 /* Read out options */ 1995 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 1996 BDRV_SECTOR_SIZE); 1997 nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); 1998 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); 1999 prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, 2000 PREALLOC_MODE_OFF, &local_err); 2001 g_free(buf); 2002 if (local_err) { 2003 error_propagate(errp, local_err); 2004 result = -EINVAL; 2005 goto out; 2006 } 2007 2008 fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, 2009 0644); 2010 if (fd < 0) { 2011 result = -errno; 2012 error_setg_errno(errp, -result, "Could not create file"); 2013 goto out; 2014 } 2015 2016 if (nocow) { 2017 #ifdef __linux__ 2018 /* Set NOCOW flag to solve performance issue on fs like btrfs. 2019 * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value 2020 * will be ignored since any failure of this operation should not 2021 * block the left work. 2022 */ 2023 int attr; 2024 if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { 2025 attr |= FS_NOCOW_FL; 2026 ioctl(fd, FS_IOC_SETFLAGS, &attr); 2027 } 2028 #endif 2029 } 2030 2031 result = raw_regular_truncate(fd, total_size, prealloc, errp); 2032 if (result < 0) { 2033 goto out_close; 2034 } 2035 2036 out_close: 2037 if (qemu_close(fd) != 0 && result == 0) { 2038 result = -errno; 2039 error_setg_errno(errp, -result, "Could not close the new file"); 2040 } 2041 out: 2042 return result; 2043 } 2044 2045 /* 2046 * Find allocation range in @bs around offset @start. 2047 * May change underlying file descriptor's file offset. 2048 * If @start is not in a hole, store @start in @data, and the 2049 * beginning of the next hole in @hole, and return 0. 2050 * If @start is in a non-trailing hole, store @start in @hole and the 2051 * beginning of the next non-hole in @data, and return 0. 2052 * If @start is in a trailing hole or beyond EOF, return -ENXIO. 2053 * If we can't find out, return a negative errno other than -ENXIO. 2054 */ 2055 static int find_allocation(BlockDriverState *bs, off_t start, 2056 off_t *data, off_t *hole) 2057 { 2058 #if defined SEEK_HOLE && defined SEEK_DATA 2059 BDRVRawState *s = bs->opaque; 2060 off_t offs; 2061 2062 /* 2063 * SEEK_DATA cases: 2064 * D1. offs == start: start is in data 2065 * D2. offs > start: start is in a hole, next data at offs 2066 * D3. offs < 0, errno = ENXIO: either start is in a trailing hole 2067 * or start is beyond EOF 2068 * If the latter happens, the file has been truncated behind 2069 * our back since we opened it. All bets are off then. 2070 * Treating like a trailing hole is simplest. 2071 * D4. offs < 0, errno != ENXIO: we learned nothing 2072 */ 2073 offs = lseek(s->fd, start, SEEK_DATA); 2074 if (offs < 0) { 2075 return -errno; /* D3 or D4 */ 2076 } 2077 assert(offs >= start); 2078 2079 if (offs > start) { 2080 /* D2: in hole, next data at offs */ 2081 *hole = start; 2082 *data = offs; 2083 return 0; 2084 } 2085 2086 /* D1: in data, end not yet known */ 2087 2088 /* 2089 * SEEK_HOLE cases: 2090 * H1. offs == start: start is in a hole 2091 * If this happens here, a hole has been dug behind our back 2092 * since the previous lseek(). 2093 * H2. offs > start: either start is in data, next hole at offs, 2094 * or start is in trailing hole, EOF at offs 2095 * Linux treats trailing holes like any other hole: offs == 2096 * start. Solaris seeks to EOF instead: offs > start (blech). 2097 * If that happens here, a hole has been dug behind our back 2098 * since the previous lseek(). 2099 * H3. offs < 0, errno = ENXIO: start is beyond EOF 2100 * If this happens, the file has been truncated behind our 2101 * back since we opened it. Treat it like a trailing hole. 2102 * H4. offs < 0, errno != ENXIO: we learned nothing 2103 * Pretend we know nothing at all, i.e. "forget" about D1. 2104 */ 2105 offs = lseek(s->fd, start, SEEK_HOLE); 2106 if (offs < 0) { 2107 return -errno; /* D1 and (H3 or H4) */ 2108 } 2109 assert(offs >= start); 2110 2111 if (offs > start) { 2112 /* 2113 * D1 and H2: either in data, next hole at offs, or it was in 2114 * data but is now in a trailing hole. In the latter case, 2115 * all bets are off. Treating it as if it there was data all 2116 * the way to EOF is safe, so simply do that. 2117 */ 2118 *data = start; 2119 *hole = offs; 2120 return 0; 2121 } 2122 2123 /* D1 and H1 */ 2124 return -EBUSY; 2125 #else 2126 return -ENOTSUP; 2127 #endif 2128 } 2129 2130 /* 2131 * Returns the allocation status of the specified sectors. 2132 * 2133 * If 'sector_num' is beyond the end of the disk image the return value is 0 2134 * and 'pnum' is set to 0. 2135 * 2136 * 'pnum' is set to the number of sectors (including and immediately following 2137 * the specified sector) that are known to be in the same 2138 * allocated/unallocated state. 2139 * 2140 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 2141 * beyond the end of the disk image it will be clamped. 2142 */ 2143 static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, 2144 int64_t sector_num, 2145 int nb_sectors, int *pnum, 2146 BlockDriverState **file) 2147 { 2148 off_t start, data = 0, hole = 0; 2149 int64_t total_size; 2150 int ret; 2151 2152 ret = fd_open(bs); 2153 if (ret < 0) { 2154 return ret; 2155 } 2156 2157 start = sector_num * BDRV_SECTOR_SIZE; 2158 total_size = bdrv_getlength(bs); 2159 if (total_size < 0) { 2160 return total_size; 2161 } else if (start >= total_size) { 2162 *pnum = 0; 2163 return 0; 2164 } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { 2165 nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); 2166 } 2167 2168 ret = find_allocation(bs, start, &data, &hole); 2169 if (ret == -ENXIO) { 2170 /* Trailing hole */ 2171 *pnum = nb_sectors; 2172 ret = BDRV_BLOCK_ZERO; 2173 } else if (ret < 0) { 2174 /* No info available, so pretend there are no holes */ 2175 *pnum = nb_sectors; 2176 ret = BDRV_BLOCK_DATA; 2177 } else if (data == start) { 2178 /* On a data extent, compute sectors to the end of the extent, 2179 * possibly including a partial sector at EOF. */ 2180 *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); 2181 ret = BDRV_BLOCK_DATA; 2182 } else { 2183 /* On a hole, compute sectors to the beginning of the next extent. */ 2184 assert(hole == start); 2185 *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); 2186 ret = BDRV_BLOCK_ZERO; 2187 } 2188 *file = bs; 2189 return ret | BDRV_BLOCK_OFFSET_VALID | start; 2190 } 2191 2192 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, 2193 int64_t offset, int bytes, 2194 BlockCompletionFunc *cb, void *opaque) 2195 { 2196 BDRVRawState *s = bs->opaque; 2197 2198 return paio_submit(bs, s->fd, offset, NULL, bytes, 2199 cb, opaque, QEMU_AIO_DISCARD); 2200 } 2201 2202 static int coroutine_fn raw_co_pwrite_zeroes( 2203 BlockDriverState *bs, int64_t offset, 2204 int bytes, BdrvRequestFlags flags) 2205 { 2206 BDRVRawState *s = bs->opaque; 2207 2208 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 2209 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2210 QEMU_AIO_WRITE_ZEROES); 2211 } else if (s->discard_zeroes) { 2212 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2213 QEMU_AIO_DISCARD); 2214 } 2215 return -ENOTSUP; 2216 } 2217 2218 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 2219 { 2220 BDRVRawState *s = bs->opaque; 2221 2222 bdi->unallocated_blocks_are_zero = s->discard_zeroes; 2223 bdi->can_write_zeroes_with_unmap = s->discard_zeroes; 2224 return 0; 2225 } 2226 2227 static QemuOptsList raw_create_opts = { 2228 .name = "raw-create-opts", 2229 .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), 2230 .desc = { 2231 { 2232 .name = BLOCK_OPT_SIZE, 2233 .type = QEMU_OPT_SIZE, 2234 .help = "Virtual disk size" 2235 }, 2236 { 2237 .name = BLOCK_OPT_NOCOW, 2238 .type = QEMU_OPT_BOOL, 2239 .help = "Turn off copy-on-write (valid only on btrfs)" 2240 }, 2241 { 2242 .name = BLOCK_OPT_PREALLOC, 2243 .type = QEMU_OPT_STRING, 2244 .help = "Preallocation mode (allowed values: off, falloc, full)" 2245 }, 2246 { /* end of list */ } 2247 } 2248 }; 2249 2250 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared, 2251 Error **errp) 2252 { 2253 return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp); 2254 } 2255 2256 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared) 2257 { 2258 BDRVRawState *s = bs->opaque; 2259 raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL); 2260 s->perm = perm; 2261 s->shared_perm = shared; 2262 } 2263 2264 static void raw_abort_perm_update(BlockDriverState *bs) 2265 { 2266 raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); 2267 } 2268 2269 BlockDriver bdrv_file = { 2270 .format_name = "file", 2271 .protocol_name = "file", 2272 .instance_size = sizeof(BDRVRawState), 2273 .bdrv_needs_filename = true, 2274 .bdrv_probe = NULL, /* no probe for protocols */ 2275 .bdrv_parse_filename = raw_parse_filename, 2276 .bdrv_file_open = raw_open, 2277 .bdrv_reopen_prepare = raw_reopen_prepare, 2278 .bdrv_reopen_commit = raw_reopen_commit, 2279 .bdrv_reopen_abort = raw_reopen_abort, 2280 .bdrv_close = raw_close, 2281 .bdrv_create = raw_create, 2282 .bdrv_has_zero_init = bdrv_has_zero_init_1, 2283 .bdrv_co_get_block_status = raw_co_get_block_status, 2284 .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, 2285 2286 .bdrv_co_preadv = raw_co_preadv, 2287 .bdrv_co_pwritev = raw_co_pwritev, 2288 .bdrv_aio_flush = raw_aio_flush, 2289 .bdrv_aio_pdiscard = raw_aio_pdiscard, 2290 .bdrv_refresh_limits = raw_refresh_limits, 2291 .bdrv_io_plug = raw_aio_plug, 2292 .bdrv_io_unplug = raw_aio_unplug, 2293 2294 .bdrv_truncate = raw_truncate, 2295 .bdrv_getlength = raw_getlength, 2296 .bdrv_get_info = raw_get_info, 2297 .bdrv_get_allocated_file_size 2298 = raw_get_allocated_file_size, 2299 .bdrv_check_perm = raw_check_perm, 2300 .bdrv_set_perm = raw_set_perm, 2301 .bdrv_abort_perm_update = raw_abort_perm_update, 2302 .create_opts = &raw_create_opts, 2303 }; 2304 2305 /***********************************************/ 2306 /* host device */ 2307 2308 #if defined(__APPLE__) && defined(__MACH__) 2309 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 2310 CFIndex maxPathSize, int flags); 2311 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) 2312 { 2313 kern_return_t kernResult = KERN_FAILURE; 2314 mach_port_t masterPort; 2315 CFMutableDictionaryRef classesToMatch; 2316 const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass}; 2317 char *mediaType = NULL; 2318 2319 kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); 2320 if ( KERN_SUCCESS != kernResult ) { 2321 printf( "IOMasterPort returned %d\n", kernResult ); 2322 } 2323 2324 int index; 2325 for (index = 0; index < ARRAY_SIZE(matching_array); index++) { 2326 classesToMatch = IOServiceMatching(matching_array[index]); 2327 if (classesToMatch == NULL) { 2328 error_report("IOServiceMatching returned NULL for %s", 2329 matching_array[index]); 2330 continue; 2331 } 2332 CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey), 2333 kCFBooleanTrue); 2334 kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch, 2335 mediaIterator); 2336 if (kernResult != KERN_SUCCESS) { 2337 error_report("Note: IOServiceGetMatchingServices returned %d", 2338 kernResult); 2339 continue; 2340 } 2341 2342 /* If a match was found, leave the loop */ 2343 if (*mediaIterator != 0) { 2344 DPRINTF("Matching using %s\n", matching_array[index]); 2345 mediaType = g_strdup(matching_array[index]); 2346 break; 2347 } 2348 } 2349 return mediaType; 2350 } 2351 2352 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 2353 CFIndex maxPathSize, int flags) 2354 { 2355 io_object_t nextMedia; 2356 kern_return_t kernResult = KERN_FAILURE; 2357 *bsdPath = '\0'; 2358 nextMedia = IOIteratorNext( mediaIterator ); 2359 if ( nextMedia ) 2360 { 2361 CFTypeRef bsdPathAsCFString; 2362 bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); 2363 if ( bsdPathAsCFString ) { 2364 size_t devPathLength; 2365 strcpy( bsdPath, _PATH_DEV ); 2366 if (flags & BDRV_O_NOCACHE) { 2367 strcat(bsdPath, "r"); 2368 } 2369 devPathLength = strlen( bsdPath ); 2370 if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { 2371 kernResult = KERN_SUCCESS; 2372 } 2373 CFRelease( bsdPathAsCFString ); 2374 } 2375 IOObjectRelease( nextMedia ); 2376 } 2377 2378 return kernResult; 2379 } 2380 2381 /* Sets up a real cdrom for use in QEMU */ 2382 static bool setup_cdrom(char *bsd_path, Error **errp) 2383 { 2384 int index, num_of_test_partitions = 2, fd; 2385 char test_partition[MAXPATHLEN]; 2386 bool partition_found = false; 2387 2388 /* look for a working partition */ 2389 for (index = 0; index < num_of_test_partitions; index++) { 2390 snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path, 2391 index); 2392 fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE); 2393 if (fd >= 0) { 2394 partition_found = true; 2395 qemu_close(fd); 2396 break; 2397 } 2398 } 2399 2400 /* if a working partition on the device was not found */ 2401 if (partition_found == false) { 2402 error_setg(errp, "Failed to find a working partition on disc"); 2403 } else { 2404 DPRINTF("Using %s as optical disc\n", test_partition); 2405 pstrcpy(bsd_path, MAXPATHLEN, test_partition); 2406 } 2407 return partition_found; 2408 } 2409 2410 /* Prints directions on mounting and unmounting a device */ 2411 static void print_unmounting_directions(const char *file_name) 2412 { 2413 error_report("If device %s is mounted on the desktop, unmount" 2414 " it first before using it in QEMU", file_name); 2415 error_report("Command to unmount device: diskutil unmountDisk %s", 2416 file_name); 2417 error_report("Command to mount device: diskutil mountDisk %s", file_name); 2418 } 2419 2420 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2421 2422 static int hdev_probe_device(const char *filename) 2423 { 2424 struct stat st; 2425 2426 /* allow a dedicated CD-ROM driver to match with a higher priority */ 2427 if (strstart(filename, "/dev/cdrom", NULL)) 2428 return 50; 2429 2430 if (stat(filename, &st) >= 0 && 2431 (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { 2432 return 100; 2433 } 2434 2435 return 0; 2436 } 2437 2438 static int check_hdev_writable(BDRVRawState *s) 2439 { 2440 #if defined(BLKROGET) 2441 /* Linux block devices can be configured "read-only" using blockdev(8). 2442 * This is independent of device node permissions and therefore open(2) 2443 * with O_RDWR succeeds. Actual writes fail with EPERM. 2444 * 2445 * bdrv_open() is supposed to fail if the disk is read-only. Explicitly 2446 * check for read-only block devices so that Linux block devices behave 2447 * properly. 2448 */ 2449 struct stat st; 2450 int readonly = 0; 2451 2452 if (fstat(s->fd, &st)) { 2453 return -errno; 2454 } 2455 2456 if (!S_ISBLK(st.st_mode)) { 2457 return 0; 2458 } 2459 2460 if (ioctl(s->fd, BLKROGET, &readonly) < 0) { 2461 return -errno; 2462 } 2463 2464 if (readonly) { 2465 return -EACCES; 2466 } 2467 #endif /* defined(BLKROGET) */ 2468 return 0; 2469 } 2470 2471 static void hdev_parse_filename(const char *filename, QDict *options, 2472 Error **errp) 2473 { 2474 bdrv_parse_filename_strip_prefix(filename, "host_device:", options); 2475 } 2476 2477 static bool hdev_is_sg(BlockDriverState *bs) 2478 { 2479 2480 #if defined(__linux__) 2481 2482 BDRVRawState *s = bs->opaque; 2483 struct stat st; 2484 struct sg_scsi_id scsiid; 2485 int sg_version; 2486 int ret; 2487 2488 if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) { 2489 return false; 2490 } 2491 2492 ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version); 2493 if (ret < 0) { 2494 return false; 2495 } 2496 2497 ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid); 2498 if (ret >= 0) { 2499 DPRINTF("SG device found: type=%d, version=%d\n", 2500 scsiid.scsi_type, sg_version); 2501 return true; 2502 } 2503 2504 #endif 2505 2506 return false; 2507 } 2508 2509 static int hdev_open(BlockDriverState *bs, QDict *options, int flags, 2510 Error **errp) 2511 { 2512 BDRVRawState *s = bs->opaque; 2513 Error *local_err = NULL; 2514 int ret; 2515 2516 #if defined(__APPLE__) && defined(__MACH__) 2517 /* 2518 * Caution: while qdict_get_str() is fine, getting non-string types 2519 * would require more care. When @options come from -blockdev or 2520 * blockdev_add, its members are typed according to the QAPI 2521 * schema, but when they come from -drive, they're all QString. 2522 */ 2523 const char *filename = qdict_get_str(options, "filename"); 2524 char bsd_path[MAXPATHLEN] = ""; 2525 bool error_occurred = false; 2526 2527 /* If using a real cdrom */ 2528 if (strcmp(filename, "/dev/cdrom") == 0) { 2529 char *mediaType = NULL; 2530 kern_return_t ret_val; 2531 io_iterator_t mediaIterator = 0; 2532 2533 mediaType = FindEjectableOpticalMedia(&mediaIterator); 2534 if (mediaType == NULL) { 2535 error_setg(errp, "Please make sure your CD/DVD is in the optical" 2536 " drive"); 2537 error_occurred = true; 2538 goto hdev_open_Mac_error; 2539 } 2540 2541 ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags); 2542 if (ret_val != KERN_SUCCESS) { 2543 error_setg(errp, "Could not get BSD path for optical drive"); 2544 error_occurred = true; 2545 goto hdev_open_Mac_error; 2546 } 2547 2548 /* If a real optical drive was not found */ 2549 if (bsd_path[0] == '\0') { 2550 error_setg(errp, "Failed to obtain bsd path for optical drive"); 2551 error_occurred = true; 2552 goto hdev_open_Mac_error; 2553 } 2554 2555 /* If using a cdrom disc and finding a partition on the disc failed */ 2556 if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 && 2557 setup_cdrom(bsd_path, errp) == false) { 2558 print_unmounting_directions(bsd_path); 2559 error_occurred = true; 2560 goto hdev_open_Mac_error; 2561 } 2562 2563 qdict_put_str(options, "filename", bsd_path); 2564 2565 hdev_open_Mac_error: 2566 g_free(mediaType); 2567 if (mediaIterator) { 2568 IOObjectRelease(mediaIterator); 2569 } 2570 if (error_occurred) { 2571 return -ENOENT; 2572 } 2573 } 2574 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2575 2576 s->type = FTYPE_FILE; 2577 2578 ret = raw_open_common(bs, options, flags, 0, &local_err); 2579 if (ret < 0) { 2580 error_propagate(errp, local_err); 2581 #if defined(__APPLE__) && defined(__MACH__) 2582 if (*bsd_path) { 2583 filename = bsd_path; 2584 } 2585 /* if a physical device experienced an error while being opened */ 2586 if (strncmp(filename, "/dev/", 5) == 0) { 2587 print_unmounting_directions(filename); 2588 } 2589 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2590 return ret; 2591 } 2592 2593 /* Since this does ioctl the device must be already opened */ 2594 bs->sg = hdev_is_sg(bs); 2595 2596 if (flags & BDRV_O_RDWR) { 2597 ret = check_hdev_writable(s); 2598 if (ret < 0) { 2599 raw_close(bs); 2600 error_setg_errno(errp, -ret, "The device is not writable"); 2601 return ret; 2602 } 2603 } 2604 2605 return ret; 2606 } 2607 2608 #if defined(__linux__) 2609 2610 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, 2611 unsigned long int req, void *buf, 2612 BlockCompletionFunc *cb, void *opaque) 2613 { 2614 BDRVRawState *s = bs->opaque; 2615 RawPosixAIOData *acb; 2616 ThreadPool *pool; 2617 2618 if (fd_open(bs) < 0) 2619 return NULL; 2620 2621 if (req == SG_IO && s->pr_mgr) { 2622 struct sg_io_hdr *io_hdr = buf; 2623 if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || 2624 io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { 2625 return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), 2626 s->fd, io_hdr, cb, opaque); 2627 } 2628 } 2629 2630 acb = g_new(RawPosixAIOData, 1); 2631 acb->bs = bs; 2632 acb->aio_type = QEMU_AIO_IOCTL; 2633 acb->aio_fildes = s->fd; 2634 acb->aio_offset = 0; 2635 acb->aio_ioctl_buf = buf; 2636 acb->aio_ioctl_cmd = req; 2637 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 2638 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); 2639 } 2640 #endif /* linux */ 2641 2642 static int fd_open(BlockDriverState *bs) 2643 { 2644 BDRVRawState *s = bs->opaque; 2645 2646 /* this is just to ensure s->fd is sane (its called by io ops) */ 2647 if (s->fd >= 0) 2648 return 0; 2649 return -EIO; 2650 } 2651 2652 static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, 2653 int64_t offset, int bytes, 2654 BlockCompletionFunc *cb, void *opaque) 2655 { 2656 BDRVRawState *s = bs->opaque; 2657 2658 if (fd_open(bs) < 0) { 2659 return NULL; 2660 } 2661 return paio_submit(bs, s->fd, offset, NULL, bytes, 2662 cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); 2663 } 2664 2665 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, 2666 int64_t offset, int bytes, BdrvRequestFlags flags) 2667 { 2668 BDRVRawState *s = bs->opaque; 2669 int rc; 2670 2671 rc = fd_open(bs); 2672 if (rc < 0) { 2673 return rc; 2674 } 2675 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 2676 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2677 QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); 2678 } else if (s->discard_zeroes) { 2679 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2680 QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); 2681 } 2682 return -ENOTSUP; 2683 } 2684 2685 static int hdev_create(const char *filename, QemuOpts *opts, 2686 Error **errp) 2687 { 2688 int fd; 2689 int ret = 0; 2690 struct stat stat_buf; 2691 int64_t total_size = 0; 2692 bool has_prefix; 2693 2694 /* This function is used by both protocol block drivers and therefore either 2695 * of these prefixes may be given. 2696 * The return value has to be stored somewhere, otherwise this is an error 2697 * due to -Werror=unused-value. */ 2698 has_prefix = 2699 strstart(filename, "host_device:", &filename) || 2700 strstart(filename, "host_cdrom:" , &filename); 2701 2702 (void)has_prefix; 2703 2704 ret = raw_normalize_devicepath(&filename); 2705 if (ret < 0) { 2706 error_setg_errno(errp, -ret, "Could not normalize device path"); 2707 return ret; 2708 } 2709 2710 /* Read out options */ 2711 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 2712 BDRV_SECTOR_SIZE); 2713 2714 fd = qemu_open(filename, O_WRONLY | O_BINARY); 2715 if (fd < 0) { 2716 ret = -errno; 2717 error_setg_errno(errp, -ret, "Could not open device"); 2718 return ret; 2719 } 2720 2721 if (fstat(fd, &stat_buf) < 0) { 2722 ret = -errno; 2723 error_setg_errno(errp, -ret, "Could not stat device"); 2724 } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { 2725 error_setg(errp, 2726 "The given file is neither a block nor a character device"); 2727 ret = -ENODEV; 2728 } else if (lseek(fd, 0, SEEK_END) < total_size) { 2729 error_setg(errp, "Device is too small"); 2730 ret = -ENOSPC; 2731 } 2732 2733 if (!ret && total_size) { 2734 uint8_t buf[BDRV_SECTOR_SIZE] = { 0 }; 2735 int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size); 2736 if (lseek(fd, 0, SEEK_SET) == -1) { 2737 ret = -errno; 2738 } else { 2739 ret = qemu_write_full(fd, buf, zero_size); 2740 ret = ret == zero_size ? 0 : -errno; 2741 } 2742 } 2743 qemu_close(fd); 2744 return ret; 2745 } 2746 2747 static BlockDriver bdrv_host_device = { 2748 .format_name = "host_device", 2749 .protocol_name = "host_device", 2750 .instance_size = sizeof(BDRVRawState), 2751 .bdrv_needs_filename = true, 2752 .bdrv_probe_device = hdev_probe_device, 2753 .bdrv_parse_filename = hdev_parse_filename, 2754 .bdrv_file_open = hdev_open, 2755 .bdrv_close = raw_close, 2756 .bdrv_reopen_prepare = raw_reopen_prepare, 2757 .bdrv_reopen_commit = raw_reopen_commit, 2758 .bdrv_reopen_abort = raw_reopen_abort, 2759 .bdrv_create = hdev_create, 2760 .create_opts = &raw_create_opts, 2761 .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, 2762 2763 .bdrv_co_preadv = raw_co_preadv, 2764 .bdrv_co_pwritev = raw_co_pwritev, 2765 .bdrv_aio_flush = raw_aio_flush, 2766 .bdrv_aio_pdiscard = hdev_aio_pdiscard, 2767 .bdrv_refresh_limits = raw_refresh_limits, 2768 .bdrv_io_plug = raw_aio_plug, 2769 .bdrv_io_unplug = raw_aio_unplug, 2770 2771 .bdrv_truncate = raw_truncate, 2772 .bdrv_getlength = raw_getlength, 2773 .bdrv_get_info = raw_get_info, 2774 .bdrv_get_allocated_file_size 2775 = raw_get_allocated_file_size, 2776 .bdrv_check_perm = raw_check_perm, 2777 .bdrv_set_perm = raw_set_perm, 2778 .bdrv_abort_perm_update = raw_abort_perm_update, 2779 .bdrv_probe_blocksizes = hdev_probe_blocksizes, 2780 .bdrv_probe_geometry = hdev_probe_geometry, 2781 2782 /* generic scsi device */ 2783 #ifdef __linux__ 2784 .bdrv_aio_ioctl = hdev_aio_ioctl, 2785 #endif 2786 }; 2787 2788 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 2789 static void cdrom_parse_filename(const char *filename, QDict *options, 2790 Error **errp) 2791 { 2792 bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options); 2793 } 2794 #endif 2795 2796 #ifdef __linux__ 2797 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 2798 Error **errp) 2799 { 2800 BDRVRawState *s = bs->opaque; 2801 2802 s->type = FTYPE_CD; 2803 2804 /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ 2805 return raw_open_common(bs, options, flags, O_NONBLOCK, errp); 2806 } 2807 2808 static int cdrom_probe_device(const char *filename) 2809 { 2810 int fd, ret; 2811 int prio = 0; 2812 struct stat st; 2813 2814 fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); 2815 if (fd < 0) { 2816 goto out; 2817 } 2818 ret = fstat(fd, &st); 2819 if (ret == -1 || !S_ISBLK(st.st_mode)) { 2820 goto outc; 2821 } 2822 2823 /* Attempt to detect via a CDROM specific ioctl */ 2824 ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 2825 if (ret >= 0) 2826 prio = 100; 2827 2828 outc: 2829 qemu_close(fd); 2830 out: 2831 return prio; 2832 } 2833 2834 static bool cdrom_is_inserted(BlockDriverState *bs) 2835 { 2836 BDRVRawState *s = bs->opaque; 2837 int ret; 2838 2839 ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 2840 return ret == CDS_DISC_OK; 2841 } 2842 2843 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 2844 { 2845 BDRVRawState *s = bs->opaque; 2846 2847 if (eject_flag) { 2848 if (ioctl(s->fd, CDROMEJECT, NULL) < 0) 2849 perror("CDROMEJECT"); 2850 } else { 2851 if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) 2852 perror("CDROMEJECT"); 2853 } 2854 } 2855 2856 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 2857 { 2858 BDRVRawState *s = bs->opaque; 2859 2860 if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { 2861 /* 2862 * Note: an error can happen if the distribution automatically 2863 * mounts the CD-ROM 2864 */ 2865 /* perror("CDROM_LOCKDOOR"); */ 2866 } 2867 } 2868 2869 static BlockDriver bdrv_host_cdrom = { 2870 .format_name = "host_cdrom", 2871 .protocol_name = "host_cdrom", 2872 .instance_size = sizeof(BDRVRawState), 2873 .bdrv_needs_filename = true, 2874 .bdrv_probe_device = cdrom_probe_device, 2875 .bdrv_parse_filename = cdrom_parse_filename, 2876 .bdrv_file_open = cdrom_open, 2877 .bdrv_close = raw_close, 2878 .bdrv_reopen_prepare = raw_reopen_prepare, 2879 .bdrv_reopen_commit = raw_reopen_commit, 2880 .bdrv_reopen_abort = raw_reopen_abort, 2881 .bdrv_create = hdev_create, 2882 .create_opts = &raw_create_opts, 2883 2884 2885 .bdrv_co_preadv = raw_co_preadv, 2886 .bdrv_co_pwritev = raw_co_pwritev, 2887 .bdrv_aio_flush = raw_aio_flush, 2888 .bdrv_refresh_limits = raw_refresh_limits, 2889 .bdrv_io_plug = raw_aio_plug, 2890 .bdrv_io_unplug = raw_aio_unplug, 2891 2892 .bdrv_truncate = raw_truncate, 2893 .bdrv_getlength = raw_getlength, 2894 .has_variable_length = true, 2895 .bdrv_get_allocated_file_size 2896 = raw_get_allocated_file_size, 2897 2898 /* removable device support */ 2899 .bdrv_is_inserted = cdrom_is_inserted, 2900 .bdrv_eject = cdrom_eject, 2901 .bdrv_lock_medium = cdrom_lock_medium, 2902 2903 /* generic scsi device */ 2904 .bdrv_aio_ioctl = hdev_aio_ioctl, 2905 }; 2906 #endif /* __linux__ */ 2907 2908 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 2909 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 2910 Error **errp) 2911 { 2912 BDRVRawState *s = bs->opaque; 2913 Error *local_err = NULL; 2914 int ret; 2915 2916 s->type = FTYPE_CD; 2917 2918 ret = raw_open_common(bs, options, flags, 0, &local_err); 2919 if (ret) { 2920 error_propagate(errp, local_err); 2921 return ret; 2922 } 2923 2924 /* make sure the door isn't locked at this time */ 2925 ioctl(s->fd, CDIOCALLOW); 2926 return 0; 2927 } 2928 2929 static int cdrom_probe_device(const char *filename) 2930 { 2931 if (strstart(filename, "/dev/cd", NULL) || 2932 strstart(filename, "/dev/acd", NULL)) 2933 return 100; 2934 return 0; 2935 } 2936 2937 static int cdrom_reopen(BlockDriverState *bs) 2938 { 2939 BDRVRawState *s = bs->opaque; 2940 int fd; 2941 2942 /* 2943 * Force reread of possibly changed/newly loaded disc, 2944 * FreeBSD seems to not notice sometimes... 2945 */ 2946 if (s->fd >= 0) 2947 qemu_close(s->fd); 2948 fd = qemu_open(bs->filename, s->open_flags, 0644); 2949 if (fd < 0) { 2950 s->fd = -1; 2951 return -EIO; 2952 } 2953 s->fd = fd; 2954 2955 /* make sure the door isn't locked at this time */ 2956 ioctl(s->fd, CDIOCALLOW); 2957 return 0; 2958 } 2959 2960 static bool cdrom_is_inserted(BlockDriverState *bs) 2961 { 2962 return raw_getlength(bs) > 0; 2963 } 2964 2965 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 2966 { 2967 BDRVRawState *s = bs->opaque; 2968 2969 if (s->fd < 0) 2970 return; 2971 2972 (void) ioctl(s->fd, CDIOCALLOW); 2973 2974 if (eject_flag) { 2975 if (ioctl(s->fd, CDIOCEJECT) < 0) 2976 perror("CDIOCEJECT"); 2977 } else { 2978 if (ioctl(s->fd, CDIOCCLOSE) < 0) 2979 perror("CDIOCCLOSE"); 2980 } 2981 2982 cdrom_reopen(bs); 2983 } 2984 2985 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 2986 { 2987 BDRVRawState *s = bs->opaque; 2988 2989 if (s->fd < 0) 2990 return; 2991 if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) { 2992 /* 2993 * Note: an error can happen if the distribution automatically 2994 * mounts the CD-ROM 2995 */ 2996 /* perror("CDROM_LOCKDOOR"); */ 2997 } 2998 } 2999 3000 static BlockDriver bdrv_host_cdrom = { 3001 .format_name = "host_cdrom", 3002 .protocol_name = "host_cdrom", 3003 .instance_size = sizeof(BDRVRawState), 3004 .bdrv_needs_filename = true, 3005 .bdrv_probe_device = cdrom_probe_device, 3006 .bdrv_parse_filename = cdrom_parse_filename, 3007 .bdrv_file_open = cdrom_open, 3008 .bdrv_close = raw_close, 3009 .bdrv_reopen_prepare = raw_reopen_prepare, 3010 .bdrv_reopen_commit = raw_reopen_commit, 3011 .bdrv_reopen_abort = raw_reopen_abort, 3012 .bdrv_create = hdev_create, 3013 .create_opts = &raw_create_opts, 3014 3015 .bdrv_co_preadv = raw_co_preadv, 3016 .bdrv_co_pwritev = raw_co_pwritev, 3017 .bdrv_aio_flush = raw_aio_flush, 3018 .bdrv_refresh_limits = raw_refresh_limits, 3019 .bdrv_io_plug = raw_aio_plug, 3020 .bdrv_io_unplug = raw_aio_unplug, 3021 3022 .bdrv_truncate = raw_truncate, 3023 .bdrv_getlength = raw_getlength, 3024 .has_variable_length = true, 3025 .bdrv_get_allocated_file_size 3026 = raw_get_allocated_file_size, 3027 3028 /* removable device support */ 3029 .bdrv_is_inserted = cdrom_is_inserted, 3030 .bdrv_eject = cdrom_eject, 3031 .bdrv_lock_medium = cdrom_lock_medium, 3032 }; 3033 #endif /* __FreeBSD__ */ 3034 3035 static void bdrv_file_init(void) 3036 { 3037 /* 3038 * Register all the drivers. Note that order is important, the driver 3039 * registered last will get probed first. 3040 */ 3041 bdrv_register(&bdrv_file); 3042 bdrv_register(&bdrv_host_device); 3043 #ifdef __linux__ 3044 bdrv_register(&bdrv_host_cdrom); 3045 #endif 3046 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 3047 bdrv_register(&bdrv_host_cdrom); 3048 #endif 3049 } 3050 3051 block_init(bdrv_file_init); 3052