1 /* 2 * Block driver for RAW files (posix) 3 * 4 * Copyright (c) 2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "qapi/error.h" 27 #include "qemu/cutils.h" 28 #include "qemu/error-report.h" 29 #include "block/block_int.h" 30 #include "qemu/module.h" 31 #include "qemu/option.h" 32 #include "trace.h" 33 #include "block/thread-pool.h" 34 #include "qemu/iov.h" 35 #include "block/raw-aio.h" 36 #include "qapi/qmp/qdict.h" 37 #include "qapi/qmp/qstring.h" 38 39 #include "scsi/pr-manager.h" 40 #include "scsi/constants.h" 41 42 #if defined(__APPLE__) && (__MACH__) 43 #include <paths.h> 44 #include <sys/param.h> 45 #include <IOKit/IOKitLib.h> 46 #include <IOKit/IOBSD.h> 47 #include <IOKit/storage/IOMediaBSDClient.h> 48 #include <IOKit/storage/IOMedia.h> 49 #include <IOKit/storage/IOCDMedia.h> 50 //#include <IOKit/storage/IOCDTypes.h> 51 #include <IOKit/storage/IODVDMedia.h> 52 #include <CoreFoundation/CoreFoundation.h> 53 #endif 54 55 #ifdef __sun__ 56 #define _POSIX_PTHREAD_SEMANTICS 1 57 #include <sys/dkio.h> 58 #endif 59 #ifdef __linux__ 60 #include <sys/ioctl.h> 61 #include <sys/param.h> 62 #include <linux/cdrom.h> 63 #include <linux/fd.h> 64 #include <linux/fs.h> 65 #include <linux/hdreg.h> 66 #include <scsi/sg.h> 67 #ifdef __s390__ 68 #include <asm/dasd.h> 69 #endif 70 #ifndef FS_NOCOW_FL 71 #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ 72 #endif 73 #endif 74 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) 75 #include <linux/falloc.h> 76 #endif 77 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 78 #include <sys/disk.h> 79 #include <sys/cdio.h> 80 #endif 81 82 #ifdef __OpenBSD__ 83 #include <sys/ioctl.h> 84 #include <sys/disklabel.h> 85 #include <sys/dkio.h> 86 #endif 87 88 #ifdef __NetBSD__ 89 #include <sys/ioctl.h> 90 #include <sys/disklabel.h> 91 #include <sys/dkio.h> 92 #include <sys/disk.h> 93 #endif 94 95 #ifdef __DragonFly__ 96 #include <sys/ioctl.h> 97 #include <sys/diskslice.h> 98 #endif 99 100 #ifdef CONFIG_XFS 101 #include <xfs/xfs.h> 102 #endif 103 104 //#define DEBUG_BLOCK 105 106 #ifdef DEBUG_BLOCK 107 # define DEBUG_BLOCK_PRINT 1 108 #else 109 # define DEBUG_BLOCK_PRINT 0 110 #endif 111 #define DPRINTF(fmt, ...) \ 112 do { \ 113 if (DEBUG_BLOCK_PRINT) { \ 114 printf(fmt, ## __VA_ARGS__); \ 115 } \ 116 } while (0) 117 118 /* OS X does not have O_DSYNC */ 119 #ifndef O_DSYNC 120 #ifdef O_SYNC 121 #define O_DSYNC O_SYNC 122 #elif defined(O_FSYNC) 123 #define O_DSYNC O_FSYNC 124 #endif 125 #endif 126 127 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ 128 #ifndef O_DIRECT 129 #define O_DIRECT O_DSYNC 130 #endif 131 132 #define FTYPE_FILE 0 133 #define FTYPE_CD 1 134 135 #define MAX_BLOCKSIZE 4096 136 137 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, 138 * leaving a few more bytes for its future use. */ 139 #define RAW_LOCK_PERM_BASE 100 140 #define RAW_LOCK_SHARED_BASE 200 141 142 typedef struct BDRVRawState { 143 int fd; 144 int lock_fd; 145 bool use_lock; 146 int type; 147 int open_flags; 148 size_t buf_align; 149 150 /* The current permissions. */ 151 uint64_t perm; 152 uint64_t shared_perm; 153 154 #ifdef CONFIG_XFS 155 bool is_xfs:1; 156 #endif 157 bool has_discard:1; 158 bool has_write_zeroes:1; 159 bool discard_zeroes:1; 160 bool use_linux_aio:1; 161 bool page_cache_inconsistent:1; 162 bool has_fallocate; 163 bool needs_alignment; 164 bool check_cache_dropped; 165 166 PRManager *pr_mgr; 167 } BDRVRawState; 168 169 typedef struct BDRVRawReopenState { 170 int fd; 171 int open_flags; 172 bool check_cache_dropped; 173 } BDRVRawReopenState; 174 175 static int fd_open(BlockDriverState *bs); 176 static int64_t raw_getlength(BlockDriverState *bs); 177 178 typedef struct RawPosixAIOData { 179 BlockDriverState *bs; 180 int aio_fildes; 181 union { 182 struct iovec *aio_iov; 183 void *aio_ioctl_buf; 184 }; 185 int aio_niov; 186 uint64_t aio_nbytes; 187 #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ 188 off_t aio_offset; 189 int aio_type; 190 } RawPosixAIOData; 191 192 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 193 static int cdrom_reopen(BlockDriverState *bs); 194 #endif 195 196 #if defined(__NetBSD__) 197 static int raw_normalize_devicepath(const char **filename) 198 { 199 static char namebuf[PATH_MAX]; 200 const char *dp, *fname; 201 struct stat sb; 202 203 fname = *filename; 204 dp = strrchr(fname, '/'); 205 if (lstat(fname, &sb) < 0) { 206 fprintf(stderr, "%s: stat failed: %s\n", 207 fname, strerror(errno)); 208 return -errno; 209 } 210 211 if (!S_ISBLK(sb.st_mode)) { 212 return 0; 213 } 214 215 if (dp == NULL) { 216 snprintf(namebuf, PATH_MAX, "r%s", fname); 217 } else { 218 snprintf(namebuf, PATH_MAX, "%.*s/r%s", 219 (int)(dp - fname), fname, dp + 1); 220 } 221 fprintf(stderr, "%s is a block device", fname); 222 *filename = namebuf; 223 fprintf(stderr, ", using %s\n", *filename); 224 225 return 0; 226 } 227 #else 228 static int raw_normalize_devicepath(const char **filename) 229 { 230 return 0; 231 } 232 #endif 233 234 /* 235 * Get logical block size via ioctl. On success store it in @sector_size_p. 236 */ 237 static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) 238 { 239 unsigned int sector_size; 240 bool success = false; 241 int i; 242 243 errno = ENOTSUP; 244 static const unsigned long ioctl_list[] = { 245 #ifdef BLKSSZGET 246 BLKSSZGET, 247 #endif 248 #ifdef DKIOCGETBLOCKSIZE 249 DKIOCGETBLOCKSIZE, 250 #endif 251 #ifdef DIOCGSECTORSIZE 252 DIOCGSECTORSIZE, 253 #endif 254 }; 255 256 /* Try a few ioctls to get the right size */ 257 for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) { 258 if (ioctl(fd, ioctl_list[i], §or_size) >= 0) { 259 *sector_size_p = sector_size; 260 success = true; 261 } 262 } 263 264 return success ? 0 : -errno; 265 } 266 267 /** 268 * Get physical block size of @fd. 269 * On success, store it in @blk_size and return 0. 270 * On failure, return -errno. 271 */ 272 static int probe_physical_blocksize(int fd, unsigned int *blk_size) 273 { 274 #ifdef BLKPBSZGET 275 if (ioctl(fd, BLKPBSZGET, blk_size) < 0) { 276 return -errno; 277 } 278 return 0; 279 #else 280 return -ENOTSUP; 281 #endif 282 } 283 284 /* Check if read is allowed with given memory buffer and length. 285 * 286 * This function is used to check O_DIRECT memory buffer and request alignment. 287 */ 288 static bool raw_is_io_aligned(int fd, void *buf, size_t len) 289 { 290 ssize_t ret = pread(fd, buf, len, 0); 291 292 if (ret >= 0) { 293 return true; 294 } 295 296 #ifdef __linux__ 297 /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore 298 * other errors (e.g. real I/O error), which could happen on a failed 299 * drive, since we only care about probing alignment. 300 */ 301 if (errno != EINVAL) { 302 return true; 303 } 304 #endif 305 306 return false; 307 } 308 309 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) 310 { 311 BDRVRawState *s = bs->opaque; 312 char *buf; 313 size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); 314 315 /* For SCSI generic devices the alignment is not really used. 316 With buffered I/O, we don't have any restrictions. */ 317 if (bdrv_is_sg(bs) || !s->needs_alignment) { 318 bs->bl.request_alignment = 1; 319 s->buf_align = 1; 320 return; 321 } 322 323 bs->bl.request_alignment = 0; 324 s->buf_align = 0; 325 /* Let's try to use the logical blocksize for the alignment. */ 326 if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) { 327 bs->bl.request_alignment = 0; 328 } 329 #ifdef CONFIG_XFS 330 if (s->is_xfs) { 331 struct dioattr da; 332 if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { 333 bs->bl.request_alignment = da.d_miniosz; 334 /* The kernel returns wrong information for d_mem */ 335 /* s->buf_align = da.d_mem; */ 336 } 337 } 338 #endif 339 340 /* If we could not get the sizes so far, we can only guess them */ 341 if (!s->buf_align) { 342 size_t align; 343 buf = qemu_memalign(max_align, 2 * max_align); 344 for (align = 512; align <= max_align; align <<= 1) { 345 if (raw_is_io_aligned(fd, buf + align, max_align)) { 346 s->buf_align = align; 347 break; 348 } 349 } 350 qemu_vfree(buf); 351 } 352 353 if (!bs->bl.request_alignment) { 354 size_t align; 355 buf = qemu_memalign(s->buf_align, max_align); 356 for (align = 512; align <= max_align; align <<= 1) { 357 if (raw_is_io_aligned(fd, buf, align)) { 358 bs->bl.request_alignment = align; 359 break; 360 } 361 } 362 qemu_vfree(buf); 363 } 364 365 if (!s->buf_align || !bs->bl.request_alignment) { 366 error_setg(errp, "Could not find working O_DIRECT alignment"); 367 error_append_hint(errp, "Try cache.direct=off\n"); 368 } 369 } 370 371 static void raw_parse_flags(int bdrv_flags, int *open_flags) 372 { 373 assert(open_flags != NULL); 374 375 *open_flags |= O_BINARY; 376 *open_flags &= ~O_ACCMODE; 377 if (bdrv_flags & BDRV_O_RDWR) { 378 *open_flags |= O_RDWR; 379 } else { 380 *open_flags |= O_RDONLY; 381 } 382 383 /* Use O_DSYNC for write-through caching, no flags for write-back caching, 384 * and O_DIRECT for no caching. */ 385 if ((bdrv_flags & BDRV_O_NOCACHE)) { 386 *open_flags |= O_DIRECT; 387 } 388 } 389 390 static void raw_parse_filename(const char *filename, QDict *options, 391 Error **errp) 392 { 393 bdrv_parse_filename_strip_prefix(filename, "file:", options); 394 } 395 396 static QemuOptsList raw_runtime_opts = { 397 .name = "raw", 398 .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), 399 .desc = { 400 { 401 .name = "filename", 402 .type = QEMU_OPT_STRING, 403 .help = "File name of the image", 404 }, 405 { 406 .name = "aio", 407 .type = QEMU_OPT_STRING, 408 .help = "host AIO implementation (threads, native)", 409 }, 410 { 411 .name = "locking", 412 .type = QEMU_OPT_STRING, 413 .help = "file locking mode (on/off/auto, default: auto)", 414 }, 415 { 416 .name = "pr-manager", 417 .type = QEMU_OPT_STRING, 418 .help = "id of persistent reservation manager object (default: none)", 419 }, 420 { 421 .name = "x-check-cache-dropped", 422 .type = QEMU_OPT_BOOL, 423 .help = "check that page cache was dropped on live migration (default: off)" 424 }, 425 { /* end of list */ } 426 }, 427 }; 428 429 static int raw_open_common(BlockDriverState *bs, QDict *options, 430 int bdrv_flags, int open_flags, Error **errp) 431 { 432 BDRVRawState *s = bs->opaque; 433 QemuOpts *opts; 434 Error *local_err = NULL; 435 const char *filename = NULL; 436 const char *str; 437 BlockdevAioOptions aio, aio_default; 438 int fd, ret; 439 struct stat st; 440 OnOffAuto locking; 441 442 opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); 443 qemu_opts_absorb_qdict(opts, options, &local_err); 444 if (local_err) { 445 error_propagate(errp, local_err); 446 ret = -EINVAL; 447 goto fail; 448 } 449 450 filename = qemu_opt_get(opts, "filename"); 451 452 ret = raw_normalize_devicepath(&filename); 453 if (ret != 0) { 454 error_setg_errno(errp, -ret, "Could not normalize device path"); 455 goto fail; 456 } 457 458 aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) 459 ? BLOCKDEV_AIO_OPTIONS_NATIVE 460 : BLOCKDEV_AIO_OPTIONS_THREADS; 461 aio = qapi_enum_parse(&BlockdevAioOptions_lookup, 462 qemu_opt_get(opts, "aio"), 463 aio_default, &local_err); 464 if (local_err) { 465 error_propagate(errp, local_err); 466 ret = -EINVAL; 467 goto fail; 468 } 469 s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); 470 471 locking = qapi_enum_parse(&OnOffAuto_lookup, 472 qemu_opt_get(opts, "locking"), 473 ON_OFF_AUTO_AUTO, &local_err); 474 if (local_err) { 475 error_propagate(errp, local_err); 476 ret = -EINVAL; 477 goto fail; 478 } 479 switch (locking) { 480 case ON_OFF_AUTO_ON: 481 s->use_lock = true; 482 if (!qemu_has_ofd_lock()) { 483 fprintf(stderr, 484 "File lock requested but OFD locking syscall is " 485 "unavailable, falling back to POSIX file locks.\n" 486 "Due to the implementation, locks can be lost " 487 "unexpectedly.\n"); 488 } 489 break; 490 case ON_OFF_AUTO_OFF: 491 s->use_lock = false; 492 break; 493 case ON_OFF_AUTO_AUTO: 494 s->use_lock = qemu_has_ofd_lock(); 495 break; 496 default: 497 abort(); 498 } 499 500 str = qemu_opt_get(opts, "pr-manager"); 501 if (str) { 502 s->pr_mgr = pr_manager_lookup(str, &local_err); 503 if (local_err) { 504 error_propagate(errp, local_err); 505 ret = -EINVAL; 506 goto fail; 507 } 508 } 509 510 s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped", 511 false); 512 513 s->open_flags = open_flags; 514 raw_parse_flags(bdrv_flags, &s->open_flags); 515 516 s->fd = -1; 517 fd = qemu_open(filename, s->open_flags, 0644); 518 if (fd < 0) { 519 ret = -errno; 520 error_setg_errno(errp, errno, "Could not open '%s'", filename); 521 if (ret == -EROFS) { 522 ret = -EACCES; 523 } 524 goto fail; 525 } 526 s->fd = fd; 527 528 s->lock_fd = -1; 529 if (s->use_lock) { 530 fd = qemu_open(filename, s->open_flags); 531 if (fd < 0) { 532 ret = -errno; 533 error_setg_errno(errp, errno, "Could not open '%s' for locking", 534 filename); 535 qemu_close(s->fd); 536 goto fail; 537 } 538 s->lock_fd = fd; 539 } 540 s->perm = 0; 541 s->shared_perm = BLK_PERM_ALL; 542 543 #ifdef CONFIG_LINUX_AIO 544 /* Currently Linux does AIO only for files opened with O_DIRECT */ 545 if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) { 546 error_setg(errp, "aio=native was specified, but it requires " 547 "cache.direct=on, which was not specified."); 548 ret = -EINVAL; 549 goto fail; 550 } 551 #else 552 if (s->use_linux_aio) { 553 error_setg(errp, "aio=native was specified, but is not supported " 554 "in this build."); 555 ret = -EINVAL; 556 goto fail; 557 } 558 #endif /* !defined(CONFIG_LINUX_AIO) */ 559 560 s->has_discard = true; 561 s->has_write_zeroes = true; 562 if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { 563 s->needs_alignment = true; 564 } 565 566 if (fstat(s->fd, &st) < 0) { 567 ret = -errno; 568 error_setg_errno(errp, errno, "Could not stat file"); 569 goto fail; 570 } 571 if (S_ISREG(st.st_mode)) { 572 s->discard_zeroes = true; 573 s->has_fallocate = true; 574 } 575 if (S_ISBLK(st.st_mode)) { 576 #ifdef BLKDISCARDZEROES 577 unsigned int arg; 578 if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { 579 s->discard_zeroes = true; 580 } 581 #endif 582 #ifdef __linux__ 583 /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do 584 * not rely on the contents of discarded blocks unless using O_DIRECT. 585 * Same for BLKZEROOUT. 586 */ 587 if (!(bs->open_flags & BDRV_O_NOCACHE)) { 588 s->discard_zeroes = false; 589 s->has_write_zeroes = false; 590 } 591 #endif 592 } 593 #ifdef __FreeBSD__ 594 if (S_ISCHR(st.st_mode)) { 595 /* 596 * The file is a char device (disk), which on FreeBSD isn't behind 597 * a pager, so force all requests to be aligned. This is needed 598 * so QEMU makes sure all IO operations on the device are aligned 599 * to sector size, or else FreeBSD will reject them with EINVAL. 600 */ 601 s->needs_alignment = true; 602 } 603 #endif 604 605 #ifdef CONFIG_XFS 606 if (platform_test_xfs_fd(s->fd)) { 607 s->is_xfs = true; 608 } 609 #endif 610 611 bs->supported_zero_flags = s->discard_zeroes ? BDRV_REQ_MAY_UNMAP : 0; 612 ret = 0; 613 fail: 614 if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { 615 unlink(filename); 616 } 617 qemu_opts_del(opts); 618 return ret; 619 } 620 621 static int raw_open(BlockDriverState *bs, QDict *options, int flags, 622 Error **errp) 623 { 624 BDRVRawState *s = bs->opaque; 625 626 s->type = FTYPE_FILE; 627 return raw_open_common(bs, options, flags, 0, errp); 628 } 629 630 typedef enum { 631 RAW_PL_PREPARE, 632 RAW_PL_COMMIT, 633 RAW_PL_ABORT, 634 } RawPermLockOp; 635 636 #define PERM_FOREACH(i) \ 637 for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++) 638 639 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the 640 * file; if @unlock == true, also unlock the unneeded bytes. 641 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared. 642 */ 643 static int raw_apply_lock_bytes(BDRVRawState *s, 644 uint64_t perm_lock_bits, 645 uint64_t shared_perm_lock_bits, 646 bool unlock, Error **errp) 647 { 648 int ret; 649 int i; 650 651 PERM_FOREACH(i) { 652 int off = RAW_LOCK_PERM_BASE + i; 653 if (perm_lock_bits & (1ULL << i)) { 654 ret = qemu_lock_fd(s->lock_fd, off, 1, false); 655 if (ret) { 656 error_setg(errp, "Failed to lock byte %d", off); 657 return ret; 658 } 659 } else if (unlock) { 660 ret = qemu_unlock_fd(s->lock_fd, off, 1); 661 if (ret) { 662 error_setg(errp, "Failed to unlock byte %d", off); 663 return ret; 664 } 665 } 666 } 667 PERM_FOREACH(i) { 668 int off = RAW_LOCK_SHARED_BASE + i; 669 if (shared_perm_lock_bits & (1ULL << i)) { 670 ret = qemu_lock_fd(s->lock_fd, off, 1, false); 671 if (ret) { 672 error_setg(errp, "Failed to lock byte %d", off); 673 return ret; 674 } 675 } else if (unlock) { 676 ret = qemu_unlock_fd(s->lock_fd, off, 1); 677 if (ret) { 678 error_setg(errp, "Failed to unlock byte %d", off); 679 return ret; 680 } 681 } 682 } 683 return 0; 684 } 685 686 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */ 687 static int raw_check_lock_bytes(BDRVRawState *s, 688 uint64_t perm, uint64_t shared_perm, 689 Error **errp) 690 { 691 int ret; 692 int i; 693 694 PERM_FOREACH(i) { 695 int off = RAW_LOCK_SHARED_BASE + i; 696 uint64_t p = 1ULL << i; 697 if (perm & p) { 698 ret = qemu_lock_fd_test(s->lock_fd, off, 1, true); 699 if (ret) { 700 char *perm_name = bdrv_perm_names(p); 701 error_setg(errp, 702 "Failed to get \"%s\" lock", 703 perm_name); 704 g_free(perm_name); 705 error_append_hint(errp, 706 "Is another process using the image?\n"); 707 return ret; 708 } 709 } 710 } 711 PERM_FOREACH(i) { 712 int off = RAW_LOCK_PERM_BASE + i; 713 uint64_t p = 1ULL << i; 714 if (!(shared_perm & p)) { 715 ret = qemu_lock_fd_test(s->lock_fd, off, 1, true); 716 if (ret) { 717 char *perm_name = bdrv_perm_names(p); 718 error_setg(errp, 719 "Failed to get shared \"%s\" lock", 720 perm_name); 721 g_free(perm_name); 722 error_append_hint(errp, 723 "Is another process using the image?\n"); 724 return ret; 725 } 726 } 727 } 728 return 0; 729 } 730 731 static int raw_handle_perm_lock(BlockDriverState *bs, 732 RawPermLockOp op, 733 uint64_t new_perm, uint64_t new_shared, 734 Error **errp) 735 { 736 BDRVRawState *s = bs->opaque; 737 int ret = 0; 738 Error *local_err = NULL; 739 740 if (!s->use_lock) { 741 return 0; 742 } 743 744 if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) { 745 return 0; 746 } 747 748 assert(s->lock_fd > 0); 749 750 switch (op) { 751 case RAW_PL_PREPARE: 752 ret = raw_apply_lock_bytes(s, s->perm | new_perm, 753 ~s->shared_perm | ~new_shared, 754 false, errp); 755 if (!ret) { 756 ret = raw_check_lock_bytes(s, new_perm, new_shared, errp); 757 if (!ret) { 758 return 0; 759 } 760 } 761 op = RAW_PL_ABORT; 762 /* fall through to unlock bytes. */ 763 case RAW_PL_ABORT: 764 raw_apply_lock_bytes(s, s->perm, ~s->shared_perm, true, &local_err); 765 if (local_err) { 766 /* Theoretically the above call only unlocks bytes and it cannot 767 * fail. Something weird happened, report it. 768 */ 769 error_report_err(local_err); 770 } 771 break; 772 case RAW_PL_COMMIT: 773 raw_apply_lock_bytes(s, new_perm, ~new_shared, true, &local_err); 774 if (local_err) { 775 /* Theoretically the above call only unlocks bytes and it cannot 776 * fail. Something weird happened, report it. 777 */ 778 error_report_err(local_err); 779 } 780 break; 781 } 782 return ret; 783 } 784 785 static int raw_reopen_prepare(BDRVReopenState *state, 786 BlockReopenQueue *queue, Error **errp) 787 { 788 BDRVRawState *s; 789 BDRVRawReopenState *rs; 790 QemuOpts *opts; 791 int ret = 0; 792 Error *local_err = NULL; 793 794 assert(state != NULL); 795 assert(state->bs != NULL); 796 797 s = state->bs->opaque; 798 799 state->opaque = g_new0(BDRVRawReopenState, 1); 800 rs = state->opaque; 801 rs->fd = -1; 802 803 /* Handle options changes */ 804 opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); 805 qemu_opts_absorb_qdict(opts, state->options, &local_err); 806 if (local_err) { 807 error_propagate(errp, local_err); 808 ret = -EINVAL; 809 goto out; 810 } 811 812 rs->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped", 813 s->check_cache_dropped); 814 815 if (s->type == FTYPE_CD) { 816 rs->open_flags |= O_NONBLOCK; 817 } 818 819 raw_parse_flags(state->flags, &rs->open_flags); 820 821 int fcntl_flags = O_APPEND | O_NONBLOCK; 822 #ifdef O_NOATIME 823 fcntl_flags |= O_NOATIME; 824 #endif 825 826 #ifdef O_ASYNC 827 /* Not all operating systems have O_ASYNC, and those that don't 828 * will not let us track the state into rs->open_flags (typically 829 * you achieve the same effect with an ioctl, for example I_SETSIG 830 * on Solaris). But we do not use O_ASYNC, so that's fine. 831 */ 832 assert((s->open_flags & O_ASYNC) == 0); 833 #endif 834 835 if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { 836 /* dup the original fd */ 837 rs->fd = qemu_dup(s->fd); 838 if (rs->fd >= 0) { 839 ret = fcntl_setfl(rs->fd, rs->open_flags); 840 if (ret) { 841 qemu_close(rs->fd); 842 rs->fd = -1; 843 } 844 } 845 } 846 847 /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ 848 if (rs->fd == -1) { 849 const char *normalized_filename = state->bs->filename; 850 ret = raw_normalize_devicepath(&normalized_filename); 851 if (ret < 0) { 852 error_setg_errno(errp, -ret, "Could not normalize device path"); 853 } else { 854 assert(!(rs->open_flags & O_CREAT)); 855 rs->fd = qemu_open(normalized_filename, rs->open_flags); 856 if (rs->fd == -1) { 857 error_setg_errno(errp, errno, "Could not reopen file"); 858 ret = -1; 859 } 860 } 861 } 862 863 /* Fail already reopen_prepare() if we can't get a working O_DIRECT 864 * alignment with the new fd. */ 865 if (rs->fd != -1) { 866 raw_probe_alignment(state->bs, rs->fd, &local_err); 867 if (local_err) { 868 qemu_close(rs->fd); 869 rs->fd = -1; 870 error_propagate(errp, local_err); 871 ret = -EINVAL; 872 } 873 } 874 875 out: 876 qemu_opts_del(opts); 877 return ret; 878 } 879 880 static void raw_reopen_commit(BDRVReopenState *state) 881 { 882 BDRVRawReopenState *rs = state->opaque; 883 BDRVRawState *s = state->bs->opaque; 884 885 s->check_cache_dropped = rs->check_cache_dropped; 886 s->open_flags = rs->open_flags; 887 888 qemu_close(s->fd); 889 s->fd = rs->fd; 890 891 g_free(state->opaque); 892 state->opaque = NULL; 893 } 894 895 896 static void raw_reopen_abort(BDRVReopenState *state) 897 { 898 BDRVRawReopenState *rs = state->opaque; 899 900 /* nothing to do if NULL, we didn't get far enough */ 901 if (rs == NULL) { 902 return; 903 } 904 905 if (rs->fd >= 0) { 906 qemu_close(rs->fd); 907 rs->fd = -1; 908 } 909 g_free(state->opaque); 910 state->opaque = NULL; 911 } 912 913 static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd) 914 { 915 #ifdef BLKSECTGET 916 int max_bytes = 0; 917 short max_sectors = 0; 918 if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) { 919 return max_bytes; 920 } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) { 921 return max_sectors << BDRV_SECTOR_BITS; 922 } else { 923 return -errno; 924 } 925 #else 926 return -ENOSYS; 927 #endif 928 } 929 930 static int hdev_get_max_segments(const struct stat *st) 931 { 932 #ifdef CONFIG_LINUX 933 char buf[32]; 934 const char *end; 935 char *sysfspath; 936 int ret; 937 int fd = -1; 938 long max_segments; 939 940 sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments", 941 major(st->st_rdev), minor(st->st_rdev)); 942 fd = open(sysfspath, O_RDONLY); 943 if (fd == -1) { 944 ret = -errno; 945 goto out; 946 } 947 do { 948 ret = read(fd, buf, sizeof(buf) - 1); 949 } while (ret == -1 && errno == EINTR); 950 if (ret < 0) { 951 ret = -errno; 952 goto out; 953 } else if (ret == 0) { 954 ret = -EIO; 955 goto out; 956 } 957 buf[ret] = 0; 958 /* The file is ended with '\n', pass 'end' to accept that. */ 959 ret = qemu_strtol(buf, &end, 10, &max_segments); 960 if (ret == 0 && end && *end == '\n') { 961 ret = max_segments; 962 } 963 964 out: 965 if (fd != -1) { 966 close(fd); 967 } 968 g_free(sysfspath); 969 return ret; 970 #else 971 return -ENOTSUP; 972 #endif 973 } 974 975 static void raw_refresh_limits(BlockDriverState *bs, Error **errp) 976 { 977 BDRVRawState *s = bs->opaque; 978 struct stat st; 979 980 if (!fstat(s->fd, &st)) { 981 if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { 982 int ret = hdev_get_max_transfer_length(bs, s->fd); 983 if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { 984 bs->bl.max_transfer = pow2floor(ret); 985 } 986 ret = hdev_get_max_segments(&st); 987 if (ret > 0) { 988 bs->bl.max_transfer = MIN(bs->bl.max_transfer, 989 ret * getpagesize()); 990 } 991 } 992 } 993 994 raw_probe_alignment(bs, s->fd, errp); 995 bs->bl.min_mem_alignment = s->buf_align; 996 bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); 997 } 998 999 static int check_for_dasd(int fd) 1000 { 1001 #ifdef BIODASDINFO2 1002 struct dasd_information2_t info = {0}; 1003 1004 return ioctl(fd, BIODASDINFO2, &info); 1005 #else 1006 return -1; 1007 #endif 1008 } 1009 1010 /** 1011 * Try to get @bs's logical and physical block size. 1012 * On success, store them in @bsz and return zero. 1013 * On failure, return negative errno. 1014 */ 1015 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) 1016 { 1017 BDRVRawState *s = bs->opaque; 1018 int ret; 1019 1020 /* If DASD, get blocksizes */ 1021 if (check_for_dasd(s->fd) < 0) { 1022 return -ENOTSUP; 1023 } 1024 ret = probe_logical_blocksize(s->fd, &bsz->log); 1025 if (ret < 0) { 1026 return ret; 1027 } 1028 return probe_physical_blocksize(s->fd, &bsz->phys); 1029 } 1030 1031 /** 1032 * Try to get @bs's geometry: cyls, heads, sectors. 1033 * On success, store them in @geo and return 0. 1034 * On failure return -errno. 1035 * (Allows block driver to assign default geometry values that guest sees) 1036 */ 1037 #ifdef __linux__ 1038 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 1039 { 1040 BDRVRawState *s = bs->opaque; 1041 struct hd_geometry ioctl_geo = {0}; 1042 1043 /* If DASD, get its geometry */ 1044 if (check_for_dasd(s->fd) < 0) { 1045 return -ENOTSUP; 1046 } 1047 if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { 1048 return -errno; 1049 } 1050 /* HDIO_GETGEO may return success even though geo contains zeros 1051 (e.g. certain multipath setups) */ 1052 if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { 1053 return -ENOTSUP; 1054 } 1055 /* Do not return a geometry for partition */ 1056 if (ioctl_geo.start != 0) { 1057 return -ENOTSUP; 1058 } 1059 geo->heads = ioctl_geo.heads; 1060 geo->sectors = ioctl_geo.sectors; 1061 geo->cylinders = ioctl_geo.cylinders; 1062 1063 return 0; 1064 } 1065 #else /* __linux__ */ 1066 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 1067 { 1068 return -ENOTSUP; 1069 } 1070 #endif 1071 1072 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) 1073 { 1074 int ret; 1075 1076 ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); 1077 if (ret == -1) { 1078 return -errno; 1079 } 1080 1081 return 0; 1082 } 1083 1084 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) 1085 { 1086 BDRVRawState *s = aiocb->bs->opaque; 1087 int ret; 1088 1089 if (s->page_cache_inconsistent) { 1090 return -EIO; 1091 } 1092 1093 ret = qemu_fdatasync(aiocb->aio_fildes); 1094 if (ret == -1) { 1095 /* There is no clear definition of the semantics of a failing fsync(), 1096 * so we may have to assume the worst. The sad truth is that this 1097 * assumption is correct for Linux. Some pages are now probably marked 1098 * clean in the page cache even though they are inconsistent with the 1099 * on-disk contents. The next fdatasync() call would succeed, but no 1100 * further writeback attempt will be made. We can't get back to a state 1101 * in which we know what is on disk (we would have to rewrite 1102 * everything that was touched since the last fdatasync() at least), so 1103 * make bdrv_flush() fail permanently. Given that the behaviour isn't 1104 * really defined, I have little hope that other OSes are doing better. 1105 * 1106 * Obviously, this doesn't affect O_DIRECT, which bypasses the page 1107 * cache. */ 1108 if ((s->open_flags & O_DIRECT) == 0) { 1109 s->page_cache_inconsistent = true; 1110 } 1111 return -errno; 1112 } 1113 return 0; 1114 } 1115 1116 #ifdef CONFIG_PREADV 1117 1118 static bool preadv_present = true; 1119 1120 static ssize_t 1121 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1122 { 1123 return preadv(fd, iov, nr_iov, offset); 1124 } 1125 1126 static ssize_t 1127 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1128 { 1129 return pwritev(fd, iov, nr_iov, offset); 1130 } 1131 1132 #else 1133 1134 static bool preadv_present = false; 1135 1136 static ssize_t 1137 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1138 { 1139 return -ENOSYS; 1140 } 1141 1142 static ssize_t 1143 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1144 { 1145 return -ENOSYS; 1146 } 1147 1148 #endif 1149 1150 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) 1151 { 1152 ssize_t len; 1153 1154 do { 1155 if (aiocb->aio_type & QEMU_AIO_WRITE) 1156 len = qemu_pwritev(aiocb->aio_fildes, 1157 aiocb->aio_iov, 1158 aiocb->aio_niov, 1159 aiocb->aio_offset); 1160 else 1161 len = qemu_preadv(aiocb->aio_fildes, 1162 aiocb->aio_iov, 1163 aiocb->aio_niov, 1164 aiocb->aio_offset); 1165 } while (len == -1 && errno == EINTR); 1166 1167 if (len == -1) { 1168 return -errno; 1169 } 1170 return len; 1171 } 1172 1173 /* 1174 * Read/writes the data to/from a given linear buffer. 1175 * 1176 * Returns the number of bytes handles or -errno in case of an error. Short 1177 * reads are only returned if the end of the file is reached. 1178 */ 1179 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) 1180 { 1181 ssize_t offset = 0; 1182 ssize_t len; 1183 1184 while (offset < aiocb->aio_nbytes) { 1185 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1186 len = pwrite(aiocb->aio_fildes, 1187 (const char *)buf + offset, 1188 aiocb->aio_nbytes - offset, 1189 aiocb->aio_offset + offset); 1190 } else { 1191 len = pread(aiocb->aio_fildes, 1192 buf + offset, 1193 aiocb->aio_nbytes - offset, 1194 aiocb->aio_offset + offset); 1195 } 1196 if (len == -1 && errno == EINTR) { 1197 continue; 1198 } else if (len == -1 && errno == EINVAL && 1199 (aiocb->bs->open_flags & BDRV_O_NOCACHE) && 1200 !(aiocb->aio_type & QEMU_AIO_WRITE) && 1201 offset > 0) { 1202 /* O_DIRECT pread() may fail with EINVAL when offset is unaligned 1203 * after a short read. Assume that O_DIRECT short reads only occur 1204 * at EOF. Therefore this is a short read, not an I/O error. 1205 */ 1206 break; 1207 } else if (len == -1) { 1208 offset = -errno; 1209 break; 1210 } else if (len == 0) { 1211 break; 1212 } 1213 offset += len; 1214 } 1215 1216 return offset; 1217 } 1218 1219 static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) 1220 { 1221 ssize_t nbytes; 1222 char *buf; 1223 1224 if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { 1225 /* 1226 * If there is just a single buffer, and it is properly aligned 1227 * we can just use plain pread/pwrite without any problems. 1228 */ 1229 if (aiocb->aio_niov == 1) { 1230 return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); 1231 } 1232 /* 1233 * We have more than one iovec, and all are properly aligned. 1234 * 1235 * Try preadv/pwritev first and fall back to linearizing the 1236 * buffer if it's not supported. 1237 */ 1238 if (preadv_present) { 1239 nbytes = handle_aiocb_rw_vector(aiocb); 1240 if (nbytes == aiocb->aio_nbytes || 1241 (nbytes < 0 && nbytes != -ENOSYS)) { 1242 return nbytes; 1243 } 1244 preadv_present = false; 1245 } 1246 1247 /* 1248 * XXX(hch): short read/write. no easy way to handle the reminder 1249 * using these interfaces. For now retry using plain 1250 * pread/pwrite? 1251 */ 1252 } 1253 1254 /* 1255 * Ok, we have to do it the hard way, copy all segments into 1256 * a single aligned buffer. 1257 */ 1258 buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); 1259 if (buf == NULL) { 1260 return -ENOMEM; 1261 } 1262 1263 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1264 char *p = buf; 1265 int i; 1266 1267 for (i = 0; i < aiocb->aio_niov; ++i) { 1268 memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); 1269 p += aiocb->aio_iov[i].iov_len; 1270 } 1271 assert(p - buf == aiocb->aio_nbytes); 1272 } 1273 1274 nbytes = handle_aiocb_rw_linear(aiocb, buf); 1275 if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { 1276 char *p = buf; 1277 size_t count = aiocb->aio_nbytes, copy; 1278 int i; 1279 1280 for (i = 0; i < aiocb->aio_niov && count; ++i) { 1281 copy = count; 1282 if (copy > aiocb->aio_iov[i].iov_len) { 1283 copy = aiocb->aio_iov[i].iov_len; 1284 } 1285 memcpy(aiocb->aio_iov[i].iov_base, p, copy); 1286 assert(count >= copy); 1287 p += copy; 1288 count -= copy; 1289 } 1290 assert(count == 0); 1291 } 1292 qemu_vfree(buf); 1293 1294 return nbytes; 1295 } 1296 1297 #ifdef CONFIG_XFS 1298 static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) 1299 { 1300 struct xfs_flock64 fl; 1301 int err; 1302 1303 memset(&fl, 0, sizeof(fl)); 1304 fl.l_whence = SEEK_SET; 1305 fl.l_start = offset; 1306 fl.l_len = bytes; 1307 1308 if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { 1309 err = errno; 1310 DPRINTF("cannot write zero range (%s)\n", strerror(errno)); 1311 return -err; 1312 } 1313 1314 return 0; 1315 } 1316 1317 static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) 1318 { 1319 struct xfs_flock64 fl; 1320 int err; 1321 1322 memset(&fl, 0, sizeof(fl)); 1323 fl.l_whence = SEEK_SET; 1324 fl.l_start = offset; 1325 fl.l_len = bytes; 1326 1327 if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { 1328 err = errno; 1329 DPRINTF("cannot punch hole (%s)\n", strerror(errno)); 1330 return -err; 1331 } 1332 1333 return 0; 1334 } 1335 #endif 1336 1337 static int translate_err(int err) 1338 { 1339 if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || 1340 err == -ENOTTY) { 1341 err = -ENOTSUP; 1342 } 1343 return err; 1344 } 1345 1346 #ifdef CONFIG_FALLOCATE 1347 static int do_fallocate(int fd, int mode, off_t offset, off_t len) 1348 { 1349 do { 1350 if (fallocate(fd, mode, offset, len) == 0) { 1351 return 0; 1352 } 1353 } while (errno == EINTR); 1354 return translate_err(-errno); 1355 } 1356 #endif 1357 1358 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) 1359 { 1360 int ret = -ENOTSUP; 1361 BDRVRawState *s = aiocb->bs->opaque; 1362 1363 if (!s->has_write_zeroes) { 1364 return -ENOTSUP; 1365 } 1366 1367 #ifdef BLKZEROOUT 1368 do { 1369 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1370 if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { 1371 return 0; 1372 } 1373 } while (errno == EINTR); 1374 1375 ret = translate_err(-errno); 1376 #endif 1377 1378 if (ret == -ENOTSUP) { 1379 s->has_write_zeroes = false; 1380 } 1381 return ret; 1382 } 1383 1384 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) 1385 { 1386 #if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) 1387 BDRVRawState *s = aiocb->bs->opaque; 1388 #endif 1389 #ifdef CONFIG_FALLOCATE 1390 int64_t len; 1391 #endif 1392 1393 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1394 return handle_aiocb_write_zeroes_block(aiocb); 1395 } 1396 1397 #ifdef CONFIG_XFS 1398 if (s->is_xfs) { 1399 return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); 1400 } 1401 #endif 1402 1403 #ifdef CONFIG_FALLOCATE_ZERO_RANGE 1404 if (s->has_write_zeroes) { 1405 int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, 1406 aiocb->aio_offset, aiocb->aio_nbytes); 1407 if (ret == 0 || ret != -ENOTSUP) { 1408 return ret; 1409 } 1410 s->has_write_zeroes = false; 1411 } 1412 #endif 1413 1414 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1415 if (s->has_discard && s->has_fallocate) { 1416 int ret = do_fallocate(s->fd, 1417 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1418 aiocb->aio_offset, aiocb->aio_nbytes); 1419 if (ret == 0) { 1420 ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1421 if (ret == 0 || ret != -ENOTSUP) { 1422 return ret; 1423 } 1424 s->has_fallocate = false; 1425 } else if (ret != -ENOTSUP) { 1426 return ret; 1427 } else { 1428 s->has_discard = false; 1429 } 1430 } 1431 #endif 1432 1433 #ifdef CONFIG_FALLOCATE 1434 /* Last resort: we are trying to extend the file with zeroed data. This 1435 * can be done via fallocate(fd, 0) */ 1436 len = bdrv_getlength(aiocb->bs); 1437 if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) { 1438 int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1439 if (ret == 0 || ret != -ENOTSUP) { 1440 return ret; 1441 } 1442 s->has_fallocate = false; 1443 } 1444 #endif 1445 1446 return -ENOTSUP; 1447 } 1448 1449 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) 1450 { 1451 int ret = -EOPNOTSUPP; 1452 BDRVRawState *s = aiocb->bs->opaque; 1453 1454 if (!s->has_discard) { 1455 return -ENOTSUP; 1456 } 1457 1458 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1459 #ifdef BLKDISCARD 1460 do { 1461 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1462 if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { 1463 return 0; 1464 } 1465 } while (errno == EINTR); 1466 1467 ret = -errno; 1468 #endif 1469 } else { 1470 #ifdef CONFIG_XFS 1471 if (s->is_xfs) { 1472 return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); 1473 } 1474 #endif 1475 1476 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1477 ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1478 aiocb->aio_offset, aiocb->aio_nbytes); 1479 #endif 1480 } 1481 1482 ret = translate_err(ret); 1483 if (ret == -ENOTSUP) { 1484 s->has_discard = false; 1485 } 1486 return ret; 1487 } 1488 1489 static int aio_worker(void *arg) 1490 { 1491 RawPosixAIOData *aiocb = arg; 1492 ssize_t ret = 0; 1493 1494 switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { 1495 case QEMU_AIO_READ: 1496 ret = handle_aiocb_rw(aiocb); 1497 if (ret >= 0 && ret < aiocb->aio_nbytes) { 1498 iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, 1499 0, aiocb->aio_nbytes - ret); 1500 1501 ret = aiocb->aio_nbytes; 1502 } 1503 if (ret == aiocb->aio_nbytes) { 1504 ret = 0; 1505 } else if (ret >= 0 && ret < aiocb->aio_nbytes) { 1506 ret = -EINVAL; 1507 } 1508 break; 1509 case QEMU_AIO_WRITE: 1510 ret = handle_aiocb_rw(aiocb); 1511 if (ret == aiocb->aio_nbytes) { 1512 ret = 0; 1513 } else if (ret >= 0 && ret < aiocb->aio_nbytes) { 1514 ret = -EINVAL; 1515 } 1516 break; 1517 case QEMU_AIO_FLUSH: 1518 ret = handle_aiocb_flush(aiocb); 1519 break; 1520 case QEMU_AIO_IOCTL: 1521 ret = handle_aiocb_ioctl(aiocb); 1522 break; 1523 case QEMU_AIO_DISCARD: 1524 ret = handle_aiocb_discard(aiocb); 1525 break; 1526 case QEMU_AIO_WRITE_ZEROES: 1527 ret = handle_aiocb_write_zeroes(aiocb); 1528 break; 1529 default: 1530 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); 1531 ret = -EINVAL; 1532 break; 1533 } 1534 1535 g_free(aiocb); 1536 return ret; 1537 } 1538 1539 static int paio_submit_co(BlockDriverState *bs, int fd, 1540 int64_t offset, QEMUIOVector *qiov, 1541 int bytes, int type) 1542 { 1543 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); 1544 ThreadPool *pool; 1545 1546 acb->bs = bs; 1547 acb->aio_type = type; 1548 acb->aio_fildes = fd; 1549 1550 acb->aio_nbytes = bytes; 1551 acb->aio_offset = offset; 1552 1553 if (qiov) { 1554 acb->aio_iov = qiov->iov; 1555 acb->aio_niov = qiov->niov; 1556 assert(qiov->size == bytes); 1557 } 1558 1559 trace_paio_submit_co(offset, bytes, type); 1560 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 1561 return thread_pool_submit_co(pool, aio_worker, acb); 1562 } 1563 1564 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, 1565 int64_t offset, QEMUIOVector *qiov, int bytes, 1566 BlockCompletionFunc *cb, void *opaque, int type) 1567 { 1568 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); 1569 ThreadPool *pool; 1570 1571 acb->bs = bs; 1572 acb->aio_type = type; 1573 acb->aio_fildes = fd; 1574 1575 acb->aio_nbytes = bytes; 1576 acb->aio_offset = offset; 1577 1578 if (qiov) { 1579 acb->aio_iov = qiov->iov; 1580 acb->aio_niov = qiov->niov; 1581 assert(qiov->size == acb->aio_nbytes); 1582 } 1583 1584 trace_paio_submit(acb, opaque, offset, bytes, type); 1585 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 1586 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); 1587 } 1588 1589 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, 1590 uint64_t bytes, QEMUIOVector *qiov, int type) 1591 { 1592 BDRVRawState *s = bs->opaque; 1593 1594 if (fd_open(bs) < 0) 1595 return -EIO; 1596 1597 /* 1598 * Check if the underlying device requires requests to be aligned, 1599 * and if the request we are trying to submit is aligned or not. 1600 * If this is the case tell the low-level driver that it needs 1601 * to copy the buffer. 1602 */ 1603 if (s->needs_alignment) { 1604 if (!bdrv_qiov_is_aligned(bs, qiov)) { 1605 type |= QEMU_AIO_MISALIGNED; 1606 #ifdef CONFIG_LINUX_AIO 1607 } else if (s->use_linux_aio) { 1608 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1609 assert(qiov->size == bytes); 1610 return laio_co_submit(bs, aio, s->fd, offset, qiov, type); 1611 #endif 1612 } 1613 } 1614 1615 return paio_submit_co(bs, s->fd, offset, qiov, bytes, type); 1616 } 1617 1618 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, 1619 uint64_t bytes, QEMUIOVector *qiov, 1620 int flags) 1621 { 1622 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ); 1623 } 1624 1625 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, 1626 uint64_t bytes, QEMUIOVector *qiov, 1627 int flags) 1628 { 1629 assert(flags == 0); 1630 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); 1631 } 1632 1633 static void raw_aio_plug(BlockDriverState *bs) 1634 { 1635 #ifdef CONFIG_LINUX_AIO 1636 BDRVRawState *s = bs->opaque; 1637 if (s->use_linux_aio) { 1638 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1639 laio_io_plug(bs, aio); 1640 } 1641 #endif 1642 } 1643 1644 static void raw_aio_unplug(BlockDriverState *bs) 1645 { 1646 #ifdef CONFIG_LINUX_AIO 1647 BDRVRawState *s = bs->opaque; 1648 if (s->use_linux_aio) { 1649 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 1650 laio_io_unplug(bs, aio); 1651 } 1652 #endif 1653 } 1654 1655 static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, 1656 BlockCompletionFunc *cb, void *opaque) 1657 { 1658 BDRVRawState *s = bs->opaque; 1659 1660 if (fd_open(bs) < 0) 1661 return NULL; 1662 1663 return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); 1664 } 1665 1666 static void raw_close(BlockDriverState *bs) 1667 { 1668 BDRVRawState *s = bs->opaque; 1669 1670 if (s->fd >= 0) { 1671 qemu_close(s->fd); 1672 s->fd = -1; 1673 } 1674 if (s->lock_fd >= 0) { 1675 qemu_close(s->lock_fd); 1676 s->lock_fd = -1; 1677 } 1678 } 1679 1680 /** 1681 * Truncates the given regular file @fd to @offset and, when growing, fills the 1682 * new space according to @prealloc. 1683 * 1684 * Returns: 0 on success, -errno on failure. 1685 */ 1686 static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc, 1687 Error **errp) 1688 { 1689 int result = 0; 1690 int64_t current_length = 0; 1691 char *buf = NULL; 1692 struct stat st; 1693 1694 if (fstat(fd, &st) < 0) { 1695 result = -errno; 1696 error_setg_errno(errp, -result, "Could not stat file"); 1697 return result; 1698 } 1699 1700 current_length = st.st_size; 1701 if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { 1702 error_setg(errp, "Cannot use preallocation for shrinking files"); 1703 return -ENOTSUP; 1704 } 1705 1706 switch (prealloc) { 1707 #ifdef CONFIG_POSIX_FALLOCATE 1708 case PREALLOC_MODE_FALLOC: 1709 /* 1710 * Truncating before posix_fallocate() makes it about twice slower on 1711 * file systems that do not support fallocate(), trying to check if a 1712 * block is allocated before allocating it, so don't do that here. 1713 */ 1714 if (offset != current_length) { 1715 result = -posix_fallocate(fd, current_length, offset - current_length); 1716 if (result != 0) { 1717 /* posix_fallocate() doesn't set errno. */ 1718 error_setg_errno(errp, -result, 1719 "Could not preallocate new data"); 1720 } 1721 } else { 1722 result = 0; 1723 } 1724 goto out; 1725 #endif 1726 case PREALLOC_MODE_FULL: 1727 { 1728 int64_t num = 0, left = offset - current_length; 1729 off_t seek_result; 1730 1731 /* 1732 * Knowing the final size from the beginning could allow the file 1733 * system driver to do less allocations and possibly avoid 1734 * fragmentation of the file. 1735 */ 1736 if (ftruncate(fd, offset) != 0) { 1737 result = -errno; 1738 error_setg_errno(errp, -result, "Could not resize file"); 1739 goto out; 1740 } 1741 1742 buf = g_malloc0(65536); 1743 1744 seek_result = lseek(fd, current_length, SEEK_SET); 1745 if (seek_result < 0) { 1746 result = -errno; 1747 error_setg_errno(errp, -result, 1748 "Failed to seek to the old end of file"); 1749 goto out; 1750 } 1751 1752 while (left > 0) { 1753 num = MIN(left, 65536); 1754 result = write(fd, buf, num); 1755 if (result < 0) { 1756 result = -errno; 1757 error_setg_errno(errp, -result, 1758 "Could not write zeros for preallocation"); 1759 goto out; 1760 } 1761 left -= result; 1762 } 1763 if (result >= 0) { 1764 result = fsync(fd); 1765 if (result < 0) { 1766 result = -errno; 1767 error_setg_errno(errp, -result, 1768 "Could not flush file to disk"); 1769 goto out; 1770 } 1771 } 1772 goto out; 1773 } 1774 case PREALLOC_MODE_OFF: 1775 if (ftruncate(fd, offset) != 0) { 1776 result = -errno; 1777 error_setg_errno(errp, -result, "Could not resize file"); 1778 } 1779 return result; 1780 default: 1781 result = -ENOTSUP; 1782 error_setg(errp, "Unsupported preallocation mode: %s", 1783 PreallocMode_str(prealloc)); 1784 return result; 1785 } 1786 1787 out: 1788 if (result < 0) { 1789 if (ftruncate(fd, current_length) < 0) { 1790 error_report("Failed to restore old file length: %s", 1791 strerror(errno)); 1792 } 1793 } 1794 1795 g_free(buf); 1796 return result; 1797 } 1798 1799 static int raw_truncate(BlockDriverState *bs, int64_t offset, 1800 PreallocMode prealloc, Error **errp) 1801 { 1802 BDRVRawState *s = bs->opaque; 1803 struct stat st; 1804 int ret; 1805 1806 if (fstat(s->fd, &st)) { 1807 ret = -errno; 1808 error_setg_errno(errp, -ret, "Failed to fstat() the file"); 1809 return ret; 1810 } 1811 1812 if (S_ISREG(st.st_mode)) { 1813 return raw_regular_truncate(s->fd, offset, prealloc, errp); 1814 } 1815 1816 if (prealloc != PREALLOC_MODE_OFF) { 1817 error_setg(errp, "Preallocation mode '%s' unsupported for this " 1818 "non-regular file", PreallocMode_str(prealloc)); 1819 return -ENOTSUP; 1820 } 1821 1822 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1823 if (offset > raw_getlength(bs)) { 1824 error_setg(errp, "Cannot grow device files"); 1825 return -EINVAL; 1826 } 1827 } else { 1828 error_setg(errp, "Resizing this file is not supported"); 1829 return -ENOTSUP; 1830 } 1831 1832 return 0; 1833 } 1834 1835 #ifdef __OpenBSD__ 1836 static int64_t raw_getlength(BlockDriverState *bs) 1837 { 1838 BDRVRawState *s = bs->opaque; 1839 int fd = s->fd; 1840 struct stat st; 1841 1842 if (fstat(fd, &st)) 1843 return -errno; 1844 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1845 struct disklabel dl; 1846 1847 if (ioctl(fd, DIOCGDINFO, &dl)) 1848 return -errno; 1849 return (uint64_t)dl.d_secsize * 1850 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 1851 } else 1852 return st.st_size; 1853 } 1854 #elif defined(__NetBSD__) 1855 static int64_t raw_getlength(BlockDriverState *bs) 1856 { 1857 BDRVRawState *s = bs->opaque; 1858 int fd = s->fd; 1859 struct stat st; 1860 1861 if (fstat(fd, &st)) 1862 return -errno; 1863 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1864 struct dkwedge_info dkw; 1865 1866 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) { 1867 return dkw.dkw_size * 512; 1868 } else { 1869 struct disklabel dl; 1870 1871 if (ioctl(fd, DIOCGDINFO, &dl)) 1872 return -errno; 1873 return (uint64_t)dl.d_secsize * 1874 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 1875 } 1876 } else 1877 return st.st_size; 1878 } 1879 #elif defined(__sun__) 1880 static int64_t raw_getlength(BlockDriverState *bs) 1881 { 1882 BDRVRawState *s = bs->opaque; 1883 struct dk_minfo minfo; 1884 int ret; 1885 int64_t size; 1886 1887 ret = fd_open(bs); 1888 if (ret < 0) { 1889 return ret; 1890 } 1891 1892 /* 1893 * Use the DKIOCGMEDIAINFO ioctl to read the size. 1894 */ 1895 ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo); 1896 if (ret != -1) { 1897 return minfo.dki_lbsize * minfo.dki_capacity; 1898 } 1899 1900 /* 1901 * There are reports that lseek on some devices fails, but 1902 * irc discussion said that contingency on contingency was overkill. 1903 */ 1904 size = lseek(s->fd, 0, SEEK_END); 1905 if (size < 0) { 1906 return -errno; 1907 } 1908 return size; 1909 } 1910 #elif defined(CONFIG_BSD) 1911 static int64_t raw_getlength(BlockDriverState *bs) 1912 { 1913 BDRVRawState *s = bs->opaque; 1914 int fd = s->fd; 1915 int64_t size; 1916 struct stat sb; 1917 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 1918 int reopened = 0; 1919 #endif 1920 int ret; 1921 1922 ret = fd_open(bs); 1923 if (ret < 0) 1924 return ret; 1925 1926 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 1927 again: 1928 #endif 1929 if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { 1930 #ifdef DIOCGMEDIASIZE 1931 if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) 1932 #elif defined(DIOCGPART) 1933 { 1934 struct partinfo pi; 1935 if (ioctl(fd, DIOCGPART, &pi) == 0) 1936 size = pi.media_size; 1937 else 1938 size = 0; 1939 } 1940 if (size == 0) 1941 #endif 1942 #if defined(__APPLE__) && defined(__MACH__) 1943 { 1944 uint64_t sectors = 0; 1945 uint32_t sector_size = 0; 1946 1947 if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 1948 && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { 1949 size = sectors * sector_size; 1950 } else { 1951 size = lseek(fd, 0LL, SEEK_END); 1952 if (size < 0) { 1953 return -errno; 1954 } 1955 } 1956 } 1957 #else 1958 size = lseek(fd, 0LL, SEEK_END); 1959 if (size < 0) { 1960 return -errno; 1961 } 1962 #endif 1963 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 1964 switch(s->type) { 1965 case FTYPE_CD: 1966 /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ 1967 if (size == 2048LL * (unsigned)-1) 1968 size = 0; 1969 /* XXX no disc? maybe we need to reopen... */ 1970 if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) { 1971 reopened = 1; 1972 goto again; 1973 } 1974 } 1975 #endif 1976 } else { 1977 size = lseek(fd, 0, SEEK_END); 1978 if (size < 0) { 1979 return -errno; 1980 } 1981 } 1982 return size; 1983 } 1984 #else 1985 static int64_t raw_getlength(BlockDriverState *bs) 1986 { 1987 BDRVRawState *s = bs->opaque; 1988 int ret; 1989 int64_t size; 1990 1991 ret = fd_open(bs); 1992 if (ret < 0) { 1993 return ret; 1994 } 1995 1996 size = lseek(s->fd, 0, SEEK_END); 1997 if (size < 0) { 1998 return -errno; 1999 } 2000 return size; 2001 } 2002 #endif 2003 2004 static int64_t raw_get_allocated_file_size(BlockDriverState *bs) 2005 { 2006 struct stat st; 2007 BDRVRawState *s = bs->opaque; 2008 2009 if (fstat(s->fd, &st) < 0) { 2010 return -errno; 2011 } 2012 return (int64_t)st.st_blocks * 512; 2013 } 2014 2015 static int raw_co_create(BlockdevCreateOptions *options, Error **errp) 2016 { 2017 BlockdevCreateOptionsFile *file_opts; 2018 int fd; 2019 int result = 0; 2020 2021 /* Validate options and set default values */ 2022 assert(options->driver == BLOCKDEV_DRIVER_FILE); 2023 file_opts = &options->u.file; 2024 2025 if (!file_opts->has_nocow) { 2026 file_opts->nocow = false; 2027 } 2028 if (!file_opts->has_preallocation) { 2029 file_opts->preallocation = PREALLOC_MODE_OFF; 2030 } 2031 2032 /* Create file */ 2033 fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, 2034 0644); 2035 if (fd < 0) { 2036 result = -errno; 2037 error_setg_errno(errp, -result, "Could not create file"); 2038 goto out; 2039 } 2040 2041 if (file_opts->nocow) { 2042 #ifdef __linux__ 2043 /* Set NOCOW flag to solve performance issue on fs like btrfs. 2044 * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value 2045 * will be ignored since any failure of this operation should not 2046 * block the left work. 2047 */ 2048 int attr; 2049 if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { 2050 attr |= FS_NOCOW_FL; 2051 ioctl(fd, FS_IOC_SETFLAGS, &attr); 2052 } 2053 #endif 2054 } 2055 2056 result = raw_regular_truncate(fd, file_opts->size, file_opts->preallocation, 2057 errp); 2058 if (result < 0) { 2059 goto out_close; 2060 } 2061 2062 out_close: 2063 if (qemu_close(fd) != 0 && result == 0) { 2064 result = -errno; 2065 error_setg_errno(errp, -result, "Could not close the new file"); 2066 } 2067 out: 2068 return result; 2069 } 2070 2071 static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, 2072 Error **errp) 2073 { 2074 BlockdevCreateOptions options; 2075 int64_t total_size = 0; 2076 bool nocow = false; 2077 PreallocMode prealloc; 2078 char *buf = NULL; 2079 Error *local_err = NULL; 2080 2081 /* Skip file: protocol prefix */ 2082 strstart(filename, "file:", &filename); 2083 2084 /* Read out options */ 2085 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 2086 BDRV_SECTOR_SIZE); 2087 nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); 2088 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); 2089 prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, 2090 PREALLOC_MODE_OFF, &local_err); 2091 g_free(buf); 2092 if (local_err) { 2093 error_propagate(errp, local_err); 2094 return -EINVAL; 2095 } 2096 2097 options = (BlockdevCreateOptions) { 2098 .driver = BLOCKDEV_DRIVER_FILE, 2099 .u.file = { 2100 .filename = (char *) filename, 2101 .size = total_size, 2102 .has_preallocation = true, 2103 .preallocation = prealloc, 2104 .has_nocow = true, 2105 .nocow = nocow, 2106 }, 2107 }; 2108 return raw_co_create(&options, errp); 2109 } 2110 2111 /* 2112 * Find allocation range in @bs around offset @start. 2113 * May change underlying file descriptor's file offset. 2114 * If @start is not in a hole, store @start in @data, and the 2115 * beginning of the next hole in @hole, and return 0. 2116 * If @start is in a non-trailing hole, store @start in @hole and the 2117 * beginning of the next non-hole in @data, and return 0. 2118 * If @start is in a trailing hole or beyond EOF, return -ENXIO. 2119 * If we can't find out, return a negative errno other than -ENXIO. 2120 */ 2121 static int find_allocation(BlockDriverState *bs, off_t start, 2122 off_t *data, off_t *hole) 2123 { 2124 #if defined SEEK_HOLE && defined SEEK_DATA 2125 BDRVRawState *s = bs->opaque; 2126 off_t offs; 2127 2128 /* 2129 * SEEK_DATA cases: 2130 * D1. offs == start: start is in data 2131 * D2. offs > start: start is in a hole, next data at offs 2132 * D3. offs < 0, errno = ENXIO: either start is in a trailing hole 2133 * or start is beyond EOF 2134 * If the latter happens, the file has been truncated behind 2135 * our back since we opened it. All bets are off then. 2136 * Treating like a trailing hole is simplest. 2137 * D4. offs < 0, errno != ENXIO: we learned nothing 2138 */ 2139 offs = lseek(s->fd, start, SEEK_DATA); 2140 if (offs < 0) { 2141 return -errno; /* D3 or D4 */ 2142 } 2143 2144 if (offs < start) { 2145 /* This is not a valid return by lseek(). We are safe to just return 2146 * -EIO in this case, and we'll treat it like D4. */ 2147 return -EIO; 2148 } 2149 2150 if (offs > start) { 2151 /* D2: in hole, next data at offs */ 2152 *hole = start; 2153 *data = offs; 2154 return 0; 2155 } 2156 2157 /* D1: in data, end not yet known */ 2158 2159 /* 2160 * SEEK_HOLE cases: 2161 * H1. offs == start: start is in a hole 2162 * If this happens here, a hole has been dug behind our back 2163 * since the previous lseek(). 2164 * H2. offs > start: either start is in data, next hole at offs, 2165 * or start is in trailing hole, EOF at offs 2166 * Linux treats trailing holes like any other hole: offs == 2167 * start. Solaris seeks to EOF instead: offs > start (blech). 2168 * If that happens here, a hole has been dug behind our back 2169 * since the previous lseek(). 2170 * H3. offs < 0, errno = ENXIO: start is beyond EOF 2171 * If this happens, the file has been truncated behind our 2172 * back since we opened it. Treat it like a trailing hole. 2173 * H4. offs < 0, errno != ENXIO: we learned nothing 2174 * Pretend we know nothing at all, i.e. "forget" about D1. 2175 */ 2176 offs = lseek(s->fd, start, SEEK_HOLE); 2177 if (offs < 0) { 2178 return -errno; /* D1 and (H3 or H4) */ 2179 } 2180 2181 if (offs < start) { 2182 /* This is not a valid return by lseek(). We are safe to just return 2183 * -EIO in this case, and we'll treat it like H4. */ 2184 return -EIO; 2185 } 2186 2187 if (offs > start) { 2188 /* 2189 * D1 and H2: either in data, next hole at offs, or it was in 2190 * data but is now in a trailing hole. In the latter case, 2191 * all bets are off. Treating it as if it there was data all 2192 * the way to EOF is safe, so simply do that. 2193 */ 2194 *data = start; 2195 *hole = offs; 2196 return 0; 2197 } 2198 2199 /* D1 and H1 */ 2200 return -EBUSY; 2201 #else 2202 return -ENOTSUP; 2203 #endif 2204 } 2205 2206 /* 2207 * Returns the allocation status of the specified offset. 2208 * 2209 * The block layer guarantees 'offset' and 'bytes' are within bounds. 2210 * 2211 * 'pnum' is set to the number of bytes (including and immediately following 2212 * the specified offset) that are known to be in the same 2213 * allocated/unallocated state. 2214 * 2215 * 'bytes' is the max value 'pnum' should be set to. 2216 */ 2217 static int coroutine_fn raw_co_block_status(BlockDriverState *bs, 2218 bool want_zero, 2219 int64_t offset, 2220 int64_t bytes, int64_t *pnum, 2221 int64_t *map, 2222 BlockDriverState **file) 2223 { 2224 off_t data = 0, hole = 0; 2225 int ret; 2226 2227 ret = fd_open(bs); 2228 if (ret < 0) { 2229 return ret; 2230 } 2231 2232 if (!want_zero) { 2233 *pnum = bytes; 2234 *map = offset; 2235 *file = bs; 2236 return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; 2237 } 2238 2239 ret = find_allocation(bs, offset, &data, &hole); 2240 if (ret == -ENXIO) { 2241 /* Trailing hole */ 2242 *pnum = bytes; 2243 ret = BDRV_BLOCK_ZERO; 2244 } else if (ret < 0) { 2245 /* No info available, so pretend there are no holes */ 2246 *pnum = bytes; 2247 ret = BDRV_BLOCK_DATA; 2248 } else if (data == offset) { 2249 /* On a data extent, compute bytes to the end of the extent, 2250 * possibly including a partial sector at EOF. */ 2251 *pnum = MIN(bytes, hole - offset); 2252 ret = BDRV_BLOCK_DATA; 2253 } else { 2254 /* On a hole, compute bytes to the beginning of the next extent. */ 2255 assert(hole == offset); 2256 *pnum = MIN(bytes, data - offset); 2257 ret = BDRV_BLOCK_ZERO; 2258 } 2259 *map = offset; 2260 *file = bs; 2261 return ret | BDRV_BLOCK_OFFSET_VALID; 2262 } 2263 2264 #if defined(__linux__) 2265 /* Verify that the file is not in the page cache */ 2266 static void check_cache_dropped(BlockDriverState *bs, Error **errp) 2267 { 2268 const size_t window_size = 128 * 1024 * 1024; 2269 BDRVRawState *s = bs->opaque; 2270 void *window = NULL; 2271 size_t length = 0; 2272 unsigned char *vec; 2273 size_t page_size; 2274 off_t offset; 2275 off_t end; 2276 2277 /* mincore(2) page status information requires 1 byte per page */ 2278 page_size = sysconf(_SC_PAGESIZE); 2279 vec = g_malloc(DIV_ROUND_UP(window_size, page_size)); 2280 2281 end = raw_getlength(bs); 2282 2283 for (offset = 0; offset < end; offset += window_size) { 2284 void *new_window; 2285 size_t new_length; 2286 size_t vec_end; 2287 size_t i; 2288 int ret; 2289 2290 /* Unmap previous window if size has changed */ 2291 new_length = MIN(end - offset, window_size); 2292 if (new_length != length) { 2293 munmap(window, length); 2294 window = NULL; 2295 length = 0; 2296 } 2297 2298 new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE, 2299 s->fd, offset); 2300 if (new_window == MAP_FAILED) { 2301 error_setg_errno(errp, errno, "mmap failed"); 2302 break; 2303 } 2304 2305 window = new_window; 2306 length = new_length; 2307 2308 ret = mincore(window, length, vec); 2309 if (ret < 0) { 2310 error_setg_errno(errp, errno, "mincore failed"); 2311 break; 2312 } 2313 2314 vec_end = DIV_ROUND_UP(length, page_size); 2315 for (i = 0; i < vec_end; i++) { 2316 if (vec[i] & 0x1) { 2317 error_setg(errp, "page cache still in use!"); 2318 break; 2319 } 2320 } 2321 } 2322 2323 if (window) { 2324 munmap(window, length); 2325 } 2326 2327 g_free(vec); 2328 } 2329 #endif /* __linux__ */ 2330 2331 static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs, 2332 Error **errp) 2333 { 2334 BDRVRawState *s = bs->opaque; 2335 int ret; 2336 2337 ret = fd_open(bs); 2338 if (ret < 0) { 2339 error_setg_errno(errp, -ret, "The file descriptor is not open"); 2340 return; 2341 } 2342 2343 if (s->open_flags & O_DIRECT) { 2344 return; /* No host kernel page cache */ 2345 } 2346 2347 #if defined(__linux__) 2348 /* This sets the scene for the next syscall... */ 2349 ret = bdrv_co_flush(bs); 2350 if (ret < 0) { 2351 error_setg_errno(errp, -ret, "flush failed"); 2352 return; 2353 } 2354 2355 /* Linux does not invalidate pages that are dirty, locked, or mmapped by a 2356 * process. These limitations are okay because we just fsynced the file, 2357 * we don't use mmap, and the file should not be in use by other processes. 2358 */ 2359 ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED); 2360 if (ret != 0) { /* the return value is a positive errno */ 2361 error_setg_errno(errp, ret, "fadvise failed"); 2362 return; 2363 } 2364 2365 if (s->check_cache_dropped) { 2366 check_cache_dropped(bs, errp); 2367 } 2368 #else /* __linux__ */ 2369 /* Do nothing. Live migration to a remote host with cache.direct=off is 2370 * unsupported on other host operating systems. Cache consistency issues 2371 * may occur but no error is reported here, partly because that's the 2372 * historical behavior and partly because it's hard to differentiate valid 2373 * configurations that should not cause errors. 2374 */ 2375 #endif /* !__linux__ */ 2376 } 2377 2378 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, 2379 int64_t offset, int bytes, 2380 BlockCompletionFunc *cb, void *opaque) 2381 { 2382 BDRVRawState *s = bs->opaque; 2383 2384 return paio_submit(bs, s->fd, offset, NULL, bytes, 2385 cb, opaque, QEMU_AIO_DISCARD); 2386 } 2387 2388 static int coroutine_fn raw_co_pwrite_zeroes( 2389 BlockDriverState *bs, int64_t offset, 2390 int bytes, BdrvRequestFlags flags) 2391 { 2392 BDRVRawState *s = bs->opaque; 2393 2394 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 2395 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2396 QEMU_AIO_WRITE_ZEROES); 2397 } else if (s->discard_zeroes) { 2398 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2399 QEMU_AIO_DISCARD); 2400 } 2401 return -ENOTSUP; 2402 } 2403 2404 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 2405 { 2406 BDRVRawState *s = bs->opaque; 2407 2408 bdi->unallocated_blocks_are_zero = s->discard_zeroes; 2409 return 0; 2410 } 2411 2412 static QemuOptsList raw_create_opts = { 2413 .name = "raw-create-opts", 2414 .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), 2415 .desc = { 2416 { 2417 .name = BLOCK_OPT_SIZE, 2418 .type = QEMU_OPT_SIZE, 2419 .help = "Virtual disk size" 2420 }, 2421 { 2422 .name = BLOCK_OPT_NOCOW, 2423 .type = QEMU_OPT_BOOL, 2424 .help = "Turn off copy-on-write (valid only on btrfs)" 2425 }, 2426 { 2427 .name = BLOCK_OPT_PREALLOC, 2428 .type = QEMU_OPT_STRING, 2429 .help = "Preallocation mode (allowed values: off, falloc, full)" 2430 }, 2431 { /* end of list */ } 2432 } 2433 }; 2434 2435 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared, 2436 Error **errp) 2437 { 2438 return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp); 2439 } 2440 2441 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared) 2442 { 2443 BDRVRawState *s = bs->opaque; 2444 raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL); 2445 s->perm = perm; 2446 s->shared_perm = shared; 2447 } 2448 2449 static void raw_abort_perm_update(BlockDriverState *bs) 2450 { 2451 raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); 2452 } 2453 2454 BlockDriver bdrv_file = { 2455 .format_name = "file", 2456 .protocol_name = "file", 2457 .instance_size = sizeof(BDRVRawState), 2458 .bdrv_needs_filename = true, 2459 .bdrv_probe = NULL, /* no probe for protocols */ 2460 .bdrv_parse_filename = raw_parse_filename, 2461 .bdrv_file_open = raw_open, 2462 .bdrv_reopen_prepare = raw_reopen_prepare, 2463 .bdrv_reopen_commit = raw_reopen_commit, 2464 .bdrv_reopen_abort = raw_reopen_abort, 2465 .bdrv_close = raw_close, 2466 .bdrv_co_create = raw_co_create, 2467 .bdrv_co_create_opts = raw_co_create_opts, 2468 .bdrv_has_zero_init = bdrv_has_zero_init_1, 2469 .bdrv_co_block_status = raw_co_block_status, 2470 .bdrv_co_invalidate_cache = raw_co_invalidate_cache, 2471 .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, 2472 2473 .bdrv_co_preadv = raw_co_preadv, 2474 .bdrv_co_pwritev = raw_co_pwritev, 2475 .bdrv_aio_flush = raw_aio_flush, 2476 .bdrv_aio_pdiscard = raw_aio_pdiscard, 2477 .bdrv_refresh_limits = raw_refresh_limits, 2478 .bdrv_io_plug = raw_aio_plug, 2479 .bdrv_io_unplug = raw_aio_unplug, 2480 2481 .bdrv_truncate = raw_truncate, 2482 .bdrv_getlength = raw_getlength, 2483 .bdrv_get_info = raw_get_info, 2484 .bdrv_get_allocated_file_size 2485 = raw_get_allocated_file_size, 2486 .bdrv_check_perm = raw_check_perm, 2487 .bdrv_set_perm = raw_set_perm, 2488 .bdrv_abort_perm_update = raw_abort_perm_update, 2489 .create_opts = &raw_create_opts, 2490 }; 2491 2492 /***********************************************/ 2493 /* host device */ 2494 2495 #if defined(__APPLE__) && defined(__MACH__) 2496 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 2497 CFIndex maxPathSize, int flags); 2498 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) 2499 { 2500 kern_return_t kernResult = KERN_FAILURE; 2501 mach_port_t masterPort; 2502 CFMutableDictionaryRef classesToMatch; 2503 const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass}; 2504 char *mediaType = NULL; 2505 2506 kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); 2507 if ( KERN_SUCCESS != kernResult ) { 2508 printf( "IOMasterPort returned %d\n", kernResult ); 2509 } 2510 2511 int index; 2512 for (index = 0; index < ARRAY_SIZE(matching_array); index++) { 2513 classesToMatch = IOServiceMatching(matching_array[index]); 2514 if (classesToMatch == NULL) { 2515 error_report("IOServiceMatching returned NULL for %s", 2516 matching_array[index]); 2517 continue; 2518 } 2519 CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey), 2520 kCFBooleanTrue); 2521 kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch, 2522 mediaIterator); 2523 if (kernResult != KERN_SUCCESS) { 2524 error_report("Note: IOServiceGetMatchingServices returned %d", 2525 kernResult); 2526 continue; 2527 } 2528 2529 /* If a match was found, leave the loop */ 2530 if (*mediaIterator != 0) { 2531 DPRINTF("Matching using %s\n", matching_array[index]); 2532 mediaType = g_strdup(matching_array[index]); 2533 break; 2534 } 2535 } 2536 return mediaType; 2537 } 2538 2539 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 2540 CFIndex maxPathSize, int flags) 2541 { 2542 io_object_t nextMedia; 2543 kern_return_t kernResult = KERN_FAILURE; 2544 *bsdPath = '\0'; 2545 nextMedia = IOIteratorNext( mediaIterator ); 2546 if ( nextMedia ) 2547 { 2548 CFTypeRef bsdPathAsCFString; 2549 bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); 2550 if ( bsdPathAsCFString ) { 2551 size_t devPathLength; 2552 strcpy( bsdPath, _PATH_DEV ); 2553 if (flags & BDRV_O_NOCACHE) { 2554 strcat(bsdPath, "r"); 2555 } 2556 devPathLength = strlen( bsdPath ); 2557 if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { 2558 kernResult = KERN_SUCCESS; 2559 } 2560 CFRelease( bsdPathAsCFString ); 2561 } 2562 IOObjectRelease( nextMedia ); 2563 } 2564 2565 return kernResult; 2566 } 2567 2568 /* Sets up a real cdrom for use in QEMU */ 2569 static bool setup_cdrom(char *bsd_path, Error **errp) 2570 { 2571 int index, num_of_test_partitions = 2, fd; 2572 char test_partition[MAXPATHLEN]; 2573 bool partition_found = false; 2574 2575 /* look for a working partition */ 2576 for (index = 0; index < num_of_test_partitions; index++) { 2577 snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path, 2578 index); 2579 fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE); 2580 if (fd >= 0) { 2581 partition_found = true; 2582 qemu_close(fd); 2583 break; 2584 } 2585 } 2586 2587 /* if a working partition on the device was not found */ 2588 if (partition_found == false) { 2589 error_setg(errp, "Failed to find a working partition on disc"); 2590 } else { 2591 DPRINTF("Using %s as optical disc\n", test_partition); 2592 pstrcpy(bsd_path, MAXPATHLEN, test_partition); 2593 } 2594 return partition_found; 2595 } 2596 2597 /* Prints directions on mounting and unmounting a device */ 2598 static void print_unmounting_directions(const char *file_name) 2599 { 2600 error_report("If device %s is mounted on the desktop, unmount" 2601 " it first before using it in QEMU", file_name); 2602 error_report("Command to unmount device: diskutil unmountDisk %s", 2603 file_name); 2604 error_report("Command to mount device: diskutil mountDisk %s", file_name); 2605 } 2606 2607 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2608 2609 static int hdev_probe_device(const char *filename) 2610 { 2611 struct stat st; 2612 2613 /* allow a dedicated CD-ROM driver to match with a higher priority */ 2614 if (strstart(filename, "/dev/cdrom", NULL)) 2615 return 50; 2616 2617 if (stat(filename, &st) >= 0 && 2618 (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { 2619 return 100; 2620 } 2621 2622 return 0; 2623 } 2624 2625 static int check_hdev_writable(BDRVRawState *s) 2626 { 2627 #if defined(BLKROGET) 2628 /* Linux block devices can be configured "read-only" using blockdev(8). 2629 * This is independent of device node permissions and therefore open(2) 2630 * with O_RDWR succeeds. Actual writes fail with EPERM. 2631 * 2632 * bdrv_open() is supposed to fail if the disk is read-only. Explicitly 2633 * check for read-only block devices so that Linux block devices behave 2634 * properly. 2635 */ 2636 struct stat st; 2637 int readonly = 0; 2638 2639 if (fstat(s->fd, &st)) { 2640 return -errno; 2641 } 2642 2643 if (!S_ISBLK(st.st_mode)) { 2644 return 0; 2645 } 2646 2647 if (ioctl(s->fd, BLKROGET, &readonly) < 0) { 2648 return -errno; 2649 } 2650 2651 if (readonly) { 2652 return -EACCES; 2653 } 2654 #endif /* defined(BLKROGET) */ 2655 return 0; 2656 } 2657 2658 static void hdev_parse_filename(const char *filename, QDict *options, 2659 Error **errp) 2660 { 2661 bdrv_parse_filename_strip_prefix(filename, "host_device:", options); 2662 } 2663 2664 static bool hdev_is_sg(BlockDriverState *bs) 2665 { 2666 2667 #if defined(__linux__) 2668 2669 BDRVRawState *s = bs->opaque; 2670 struct stat st; 2671 struct sg_scsi_id scsiid; 2672 int sg_version; 2673 int ret; 2674 2675 if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) { 2676 return false; 2677 } 2678 2679 ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version); 2680 if (ret < 0) { 2681 return false; 2682 } 2683 2684 ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid); 2685 if (ret >= 0) { 2686 DPRINTF("SG device found: type=%d, version=%d\n", 2687 scsiid.scsi_type, sg_version); 2688 return true; 2689 } 2690 2691 #endif 2692 2693 return false; 2694 } 2695 2696 static int hdev_open(BlockDriverState *bs, QDict *options, int flags, 2697 Error **errp) 2698 { 2699 BDRVRawState *s = bs->opaque; 2700 Error *local_err = NULL; 2701 int ret; 2702 2703 #if defined(__APPLE__) && defined(__MACH__) 2704 /* 2705 * Caution: while qdict_get_str() is fine, getting non-string types 2706 * would require more care. When @options come from -blockdev or 2707 * blockdev_add, its members are typed according to the QAPI 2708 * schema, but when they come from -drive, they're all QString. 2709 */ 2710 const char *filename = qdict_get_str(options, "filename"); 2711 char bsd_path[MAXPATHLEN] = ""; 2712 bool error_occurred = false; 2713 2714 /* If using a real cdrom */ 2715 if (strcmp(filename, "/dev/cdrom") == 0) { 2716 char *mediaType = NULL; 2717 kern_return_t ret_val; 2718 io_iterator_t mediaIterator = 0; 2719 2720 mediaType = FindEjectableOpticalMedia(&mediaIterator); 2721 if (mediaType == NULL) { 2722 error_setg(errp, "Please make sure your CD/DVD is in the optical" 2723 " drive"); 2724 error_occurred = true; 2725 goto hdev_open_Mac_error; 2726 } 2727 2728 ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags); 2729 if (ret_val != KERN_SUCCESS) { 2730 error_setg(errp, "Could not get BSD path for optical drive"); 2731 error_occurred = true; 2732 goto hdev_open_Mac_error; 2733 } 2734 2735 /* If a real optical drive was not found */ 2736 if (bsd_path[0] == '\0') { 2737 error_setg(errp, "Failed to obtain bsd path for optical drive"); 2738 error_occurred = true; 2739 goto hdev_open_Mac_error; 2740 } 2741 2742 /* If using a cdrom disc and finding a partition on the disc failed */ 2743 if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 && 2744 setup_cdrom(bsd_path, errp) == false) { 2745 print_unmounting_directions(bsd_path); 2746 error_occurred = true; 2747 goto hdev_open_Mac_error; 2748 } 2749 2750 qdict_put_str(options, "filename", bsd_path); 2751 2752 hdev_open_Mac_error: 2753 g_free(mediaType); 2754 if (mediaIterator) { 2755 IOObjectRelease(mediaIterator); 2756 } 2757 if (error_occurred) { 2758 return -ENOENT; 2759 } 2760 } 2761 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2762 2763 s->type = FTYPE_FILE; 2764 2765 ret = raw_open_common(bs, options, flags, 0, &local_err); 2766 if (ret < 0) { 2767 error_propagate(errp, local_err); 2768 #if defined(__APPLE__) && defined(__MACH__) 2769 if (*bsd_path) { 2770 filename = bsd_path; 2771 } 2772 /* if a physical device experienced an error while being opened */ 2773 if (strncmp(filename, "/dev/", 5) == 0) { 2774 print_unmounting_directions(filename); 2775 } 2776 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2777 return ret; 2778 } 2779 2780 /* Since this does ioctl the device must be already opened */ 2781 bs->sg = hdev_is_sg(bs); 2782 2783 if (flags & BDRV_O_RDWR) { 2784 ret = check_hdev_writable(s); 2785 if (ret < 0) { 2786 raw_close(bs); 2787 error_setg_errno(errp, -ret, "The device is not writable"); 2788 return ret; 2789 } 2790 } 2791 2792 return ret; 2793 } 2794 2795 #if defined(__linux__) 2796 2797 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, 2798 unsigned long int req, void *buf, 2799 BlockCompletionFunc *cb, void *opaque) 2800 { 2801 BDRVRawState *s = bs->opaque; 2802 RawPosixAIOData *acb; 2803 ThreadPool *pool; 2804 2805 if (fd_open(bs) < 0) 2806 return NULL; 2807 2808 if (req == SG_IO && s->pr_mgr) { 2809 struct sg_io_hdr *io_hdr = buf; 2810 if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || 2811 io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { 2812 return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), 2813 s->fd, io_hdr, cb, opaque); 2814 } 2815 } 2816 2817 acb = g_new(RawPosixAIOData, 1); 2818 acb->bs = bs; 2819 acb->aio_type = QEMU_AIO_IOCTL; 2820 acb->aio_fildes = s->fd; 2821 acb->aio_offset = 0; 2822 acb->aio_ioctl_buf = buf; 2823 acb->aio_ioctl_cmd = req; 2824 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 2825 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); 2826 } 2827 #endif /* linux */ 2828 2829 static int fd_open(BlockDriverState *bs) 2830 { 2831 BDRVRawState *s = bs->opaque; 2832 2833 /* this is just to ensure s->fd is sane (its called by io ops) */ 2834 if (s->fd >= 0) 2835 return 0; 2836 return -EIO; 2837 } 2838 2839 static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, 2840 int64_t offset, int bytes, 2841 BlockCompletionFunc *cb, void *opaque) 2842 { 2843 BDRVRawState *s = bs->opaque; 2844 2845 if (fd_open(bs) < 0) { 2846 return NULL; 2847 } 2848 return paio_submit(bs, s->fd, offset, NULL, bytes, 2849 cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); 2850 } 2851 2852 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, 2853 int64_t offset, int bytes, BdrvRequestFlags flags) 2854 { 2855 BDRVRawState *s = bs->opaque; 2856 int rc; 2857 2858 rc = fd_open(bs); 2859 if (rc < 0) { 2860 return rc; 2861 } 2862 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 2863 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2864 QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); 2865 } else if (s->discard_zeroes) { 2866 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2867 QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); 2868 } 2869 return -ENOTSUP; 2870 } 2871 2872 static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts, 2873 Error **errp) 2874 { 2875 int fd; 2876 int ret = 0; 2877 struct stat stat_buf; 2878 int64_t total_size = 0; 2879 bool has_prefix; 2880 2881 /* This function is used by both protocol block drivers and therefore either 2882 * of these prefixes may be given. 2883 * The return value has to be stored somewhere, otherwise this is an error 2884 * due to -Werror=unused-value. */ 2885 has_prefix = 2886 strstart(filename, "host_device:", &filename) || 2887 strstart(filename, "host_cdrom:" , &filename); 2888 2889 (void)has_prefix; 2890 2891 ret = raw_normalize_devicepath(&filename); 2892 if (ret < 0) { 2893 error_setg_errno(errp, -ret, "Could not normalize device path"); 2894 return ret; 2895 } 2896 2897 /* Read out options */ 2898 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 2899 BDRV_SECTOR_SIZE); 2900 2901 fd = qemu_open(filename, O_WRONLY | O_BINARY); 2902 if (fd < 0) { 2903 ret = -errno; 2904 error_setg_errno(errp, -ret, "Could not open device"); 2905 return ret; 2906 } 2907 2908 if (fstat(fd, &stat_buf) < 0) { 2909 ret = -errno; 2910 error_setg_errno(errp, -ret, "Could not stat device"); 2911 } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { 2912 error_setg(errp, 2913 "The given file is neither a block nor a character device"); 2914 ret = -ENODEV; 2915 } else if (lseek(fd, 0, SEEK_END) < total_size) { 2916 error_setg(errp, "Device is too small"); 2917 ret = -ENOSPC; 2918 } 2919 2920 if (!ret && total_size) { 2921 uint8_t buf[BDRV_SECTOR_SIZE] = { 0 }; 2922 int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size); 2923 if (lseek(fd, 0, SEEK_SET) == -1) { 2924 ret = -errno; 2925 } else { 2926 ret = qemu_write_full(fd, buf, zero_size); 2927 ret = ret == zero_size ? 0 : -errno; 2928 } 2929 } 2930 qemu_close(fd); 2931 return ret; 2932 } 2933 2934 static BlockDriver bdrv_host_device = { 2935 .format_name = "host_device", 2936 .protocol_name = "host_device", 2937 .instance_size = sizeof(BDRVRawState), 2938 .bdrv_needs_filename = true, 2939 .bdrv_probe_device = hdev_probe_device, 2940 .bdrv_parse_filename = hdev_parse_filename, 2941 .bdrv_file_open = hdev_open, 2942 .bdrv_close = raw_close, 2943 .bdrv_reopen_prepare = raw_reopen_prepare, 2944 .bdrv_reopen_commit = raw_reopen_commit, 2945 .bdrv_reopen_abort = raw_reopen_abort, 2946 .bdrv_co_create_opts = hdev_co_create_opts, 2947 .create_opts = &raw_create_opts, 2948 .bdrv_co_invalidate_cache = raw_co_invalidate_cache, 2949 .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, 2950 2951 .bdrv_co_preadv = raw_co_preadv, 2952 .bdrv_co_pwritev = raw_co_pwritev, 2953 .bdrv_aio_flush = raw_aio_flush, 2954 .bdrv_aio_pdiscard = hdev_aio_pdiscard, 2955 .bdrv_refresh_limits = raw_refresh_limits, 2956 .bdrv_io_plug = raw_aio_plug, 2957 .bdrv_io_unplug = raw_aio_unplug, 2958 2959 .bdrv_truncate = raw_truncate, 2960 .bdrv_getlength = raw_getlength, 2961 .bdrv_get_info = raw_get_info, 2962 .bdrv_get_allocated_file_size 2963 = raw_get_allocated_file_size, 2964 .bdrv_check_perm = raw_check_perm, 2965 .bdrv_set_perm = raw_set_perm, 2966 .bdrv_abort_perm_update = raw_abort_perm_update, 2967 .bdrv_probe_blocksizes = hdev_probe_blocksizes, 2968 .bdrv_probe_geometry = hdev_probe_geometry, 2969 2970 /* generic scsi device */ 2971 #ifdef __linux__ 2972 .bdrv_aio_ioctl = hdev_aio_ioctl, 2973 #endif 2974 }; 2975 2976 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 2977 static void cdrom_parse_filename(const char *filename, QDict *options, 2978 Error **errp) 2979 { 2980 bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options); 2981 } 2982 #endif 2983 2984 #ifdef __linux__ 2985 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 2986 Error **errp) 2987 { 2988 BDRVRawState *s = bs->opaque; 2989 2990 s->type = FTYPE_CD; 2991 2992 /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ 2993 return raw_open_common(bs, options, flags, O_NONBLOCK, errp); 2994 } 2995 2996 static int cdrom_probe_device(const char *filename) 2997 { 2998 int fd, ret; 2999 int prio = 0; 3000 struct stat st; 3001 3002 fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); 3003 if (fd < 0) { 3004 goto out; 3005 } 3006 ret = fstat(fd, &st); 3007 if (ret == -1 || !S_ISBLK(st.st_mode)) { 3008 goto outc; 3009 } 3010 3011 /* Attempt to detect via a CDROM specific ioctl */ 3012 ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 3013 if (ret >= 0) 3014 prio = 100; 3015 3016 outc: 3017 qemu_close(fd); 3018 out: 3019 return prio; 3020 } 3021 3022 static bool cdrom_is_inserted(BlockDriverState *bs) 3023 { 3024 BDRVRawState *s = bs->opaque; 3025 int ret; 3026 3027 ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 3028 return ret == CDS_DISC_OK; 3029 } 3030 3031 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 3032 { 3033 BDRVRawState *s = bs->opaque; 3034 3035 if (eject_flag) { 3036 if (ioctl(s->fd, CDROMEJECT, NULL) < 0) 3037 perror("CDROMEJECT"); 3038 } else { 3039 if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) 3040 perror("CDROMEJECT"); 3041 } 3042 } 3043 3044 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 3045 { 3046 BDRVRawState *s = bs->opaque; 3047 3048 if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { 3049 /* 3050 * Note: an error can happen if the distribution automatically 3051 * mounts the CD-ROM 3052 */ 3053 /* perror("CDROM_LOCKDOOR"); */ 3054 } 3055 } 3056 3057 static BlockDriver bdrv_host_cdrom = { 3058 .format_name = "host_cdrom", 3059 .protocol_name = "host_cdrom", 3060 .instance_size = sizeof(BDRVRawState), 3061 .bdrv_needs_filename = true, 3062 .bdrv_probe_device = cdrom_probe_device, 3063 .bdrv_parse_filename = cdrom_parse_filename, 3064 .bdrv_file_open = cdrom_open, 3065 .bdrv_close = raw_close, 3066 .bdrv_reopen_prepare = raw_reopen_prepare, 3067 .bdrv_reopen_commit = raw_reopen_commit, 3068 .bdrv_reopen_abort = raw_reopen_abort, 3069 .bdrv_co_create_opts = hdev_co_create_opts, 3070 .create_opts = &raw_create_opts, 3071 .bdrv_co_invalidate_cache = raw_co_invalidate_cache, 3072 3073 3074 .bdrv_co_preadv = raw_co_preadv, 3075 .bdrv_co_pwritev = raw_co_pwritev, 3076 .bdrv_aio_flush = raw_aio_flush, 3077 .bdrv_refresh_limits = raw_refresh_limits, 3078 .bdrv_io_plug = raw_aio_plug, 3079 .bdrv_io_unplug = raw_aio_unplug, 3080 3081 .bdrv_truncate = raw_truncate, 3082 .bdrv_getlength = raw_getlength, 3083 .has_variable_length = true, 3084 .bdrv_get_allocated_file_size 3085 = raw_get_allocated_file_size, 3086 3087 /* removable device support */ 3088 .bdrv_is_inserted = cdrom_is_inserted, 3089 .bdrv_eject = cdrom_eject, 3090 .bdrv_lock_medium = cdrom_lock_medium, 3091 3092 /* generic scsi device */ 3093 .bdrv_aio_ioctl = hdev_aio_ioctl, 3094 }; 3095 #endif /* __linux__ */ 3096 3097 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 3098 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 3099 Error **errp) 3100 { 3101 BDRVRawState *s = bs->opaque; 3102 Error *local_err = NULL; 3103 int ret; 3104 3105 s->type = FTYPE_CD; 3106 3107 ret = raw_open_common(bs, options, flags, 0, &local_err); 3108 if (ret) { 3109 error_propagate(errp, local_err); 3110 return ret; 3111 } 3112 3113 /* make sure the door isn't locked at this time */ 3114 ioctl(s->fd, CDIOCALLOW); 3115 return 0; 3116 } 3117 3118 static int cdrom_probe_device(const char *filename) 3119 { 3120 if (strstart(filename, "/dev/cd", NULL) || 3121 strstart(filename, "/dev/acd", NULL)) 3122 return 100; 3123 return 0; 3124 } 3125 3126 static int cdrom_reopen(BlockDriverState *bs) 3127 { 3128 BDRVRawState *s = bs->opaque; 3129 int fd; 3130 3131 /* 3132 * Force reread of possibly changed/newly loaded disc, 3133 * FreeBSD seems to not notice sometimes... 3134 */ 3135 if (s->fd >= 0) 3136 qemu_close(s->fd); 3137 fd = qemu_open(bs->filename, s->open_flags, 0644); 3138 if (fd < 0) { 3139 s->fd = -1; 3140 return -EIO; 3141 } 3142 s->fd = fd; 3143 3144 /* make sure the door isn't locked at this time */ 3145 ioctl(s->fd, CDIOCALLOW); 3146 return 0; 3147 } 3148 3149 static bool cdrom_is_inserted(BlockDriverState *bs) 3150 { 3151 return raw_getlength(bs) > 0; 3152 } 3153 3154 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 3155 { 3156 BDRVRawState *s = bs->opaque; 3157 3158 if (s->fd < 0) 3159 return; 3160 3161 (void) ioctl(s->fd, CDIOCALLOW); 3162 3163 if (eject_flag) { 3164 if (ioctl(s->fd, CDIOCEJECT) < 0) 3165 perror("CDIOCEJECT"); 3166 } else { 3167 if (ioctl(s->fd, CDIOCCLOSE) < 0) 3168 perror("CDIOCCLOSE"); 3169 } 3170 3171 cdrom_reopen(bs); 3172 } 3173 3174 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 3175 { 3176 BDRVRawState *s = bs->opaque; 3177 3178 if (s->fd < 0) 3179 return; 3180 if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) { 3181 /* 3182 * Note: an error can happen if the distribution automatically 3183 * mounts the CD-ROM 3184 */ 3185 /* perror("CDROM_LOCKDOOR"); */ 3186 } 3187 } 3188 3189 static BlockDriver bdrv_host_cdrom = { 3190 .format_name = "host_cdrom", 3191 .protocol_name = "host_cdrom", 3192 .instance_size = sizeof(BDRVRawState), 3193 .bdrv_needs_filename = true, 3194 .bdrv_probe_device = cdrom_probe_device, 3195 .bdrv_parse_filename = cdrom_parse_filename, 3196 .bdrv_file_open = cdrom_open, 3197 .bdrv_close = raw_close, 3198 .bdrv_reopen_prepare = raw_reopen_prepare, 3199 .bdrv_reopen_commit = raw_reopen_commit, 3200 .bdrv_reopen_abort = raw_reopen_abort, 3201 .bdrv_co_create_opts = hdev_co_create_opts, 3202 .create_opts = &raw_create_opts, 3203 3204 .bdrv_co_preadv = raw_co_preadv, 3205 .bdrv_co_pwritev = raw_co_pwritev, 3206 .bdrv_aio_flush = raw_aio_flush, 3207 .bdrv_refresh_limits = raw_refresh_limits, 3208 .bdrv_io_plug = raw_aio_plug, 3209 .bdrv_io_unplug = raw_aio_unplug, 3210 3211 .bdrv_truncate = raw_truncate, 3212 .bdrv_getlength = raw_getlength, 3213 .has_variable_length = true, 3214 .bdrv_get_allocated_file_size 3215 = raw_get_allocated_file_size, 3216 3217 /* removable device support */ 3218 .bdrv_is_inserted = cdrom_is_inserted, 3219 .bdrv_eject = cdrom_eject, 3220 .bdrv_lock_medium = cdrom_lock_medium, 3221 }; 3222 #endif /* __FreeBSD__ */ 3223 3224 static void bdrv_file_init(void) 3225 { 3226 /* 3227 * Register all the drivers. Note that order is important, the driver 3228 * registered last will get probed first. 3229 */ 3230 bdrv_register(&bdrv_file); 3231 bdrv_register(&bdrv_host_device); 3232 #ifdef __linux__ 3233 bdrv_register(&bdrv_host_cdrom); 3234 #endif 3235 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 3236 bdrv_register(&bdrv_host_cdrom); 3237 #endif 3238 } 3239 3240 block_init(bdrv_file_init); 3241