1 /* 2 * Block driver for RAW files (posix) 3 * 4 * Copyright (c) 2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "qemu-common.h" 27 #include "qapi/error.h" 28 #include "qemu/cutils.h" 29 #include "qemu/error-report.h" 30 #include "block/block_int.h" 31 #include "qemu/module.h" 32 #include "qemu/option.h" 33 #include "qemu/units.h" 34 #include "trace.h" 35 #include "block/thread-pool.h" 36 #include "qemu/iov.h" 37 #include "block/raw-aio.h" 38 #include "qapi/qmp/qdict.h" 39 #include "qapi/qmp/qstring.h" 40 41 #include "scsi/pr-manager.h" 42 #include "scsi/constants.h" 43 44 #if defined(__APPLE__) && (__MACH__) 45 #include <paths.h> 46 #include <sys/param.h> 47 #include <IOKit/IOKitLib.h> 48 #include <IOKit/IOBSD.h> 49 #include <IOKit/storage/IOMediaBSDClient.h> 50 #include <IOKit/storage/IOMedia.h> 51 #include <IOKit/storage/IOCDMedia.h> 52 //#include <IOKit/storage/IOCDTypes.h> 53 #include <IOKit/storage/IODVDMedia.h> 54 #include <CoreFoundation/CoreFoundation.h> 55 #endif 56 57 #ifdef __sun__ 58 #define _POSIX_PTHREAD_SEMANTICS 1 59 #include <sys/dkio.h> 60 #endif 61 #ifdef __linux__ 62 #include <sys/ioctl.h> 63 #include <sys/param.h> 64 #include <sys/syscall.h> 65 #include <sys/vfs.h> 66 #include <linux/cdrom.h> 67 #include <linux/fd.h> 68 #include <linux/fs.h> 69 #include <linux/hdreg.h> 70 #include <linux/magic.h> 71 #include <scsi/sg.h> 72 #ifdef __s390__ 73 #include <asm/dasd.h> 74 #endif 75 #ifndef FS_NOCOW_FL 76 #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ 77 #endif 78 #endif 79 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) 80 #include <linux/falloc.h> 81 #endif 82 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 83 #include <sys/disk.h> 84 #include <sys/cdio.h> 85 #endif 86 87 #ifdef __OpenBSD__ 88 #include <sys/ioctl.h> 89 #include <sys/disklabel.h> 90 #include <sys/dkio.h> 91 #endif 92 93 #ifdef __NetBSD__ 94 #include <sys/ioctl.h> 95 #include <sys/disklabel.h> 96 #include <sys/dkio.h> 97 #include <sys/disk.h> 98 #endif 99 100 #ifdef __DragonFly__ 101 #include <sys/ioctl.h> 102 #include <sys/diskslice.h> 103 #endif 104 105 #ifdef CONFIG_XFS 106 #include <xfs/xfs.h> 107 #endif 108 109 #include "trace.h" 110 111 /* OS X does not have O_DSYNC */ 112 #ifndef O_DSYNC 113 #ifdef O_SYNC 114 #define O_DSYNC O_SYNC 115 #elif defined(O_FSYNC) 116 #define O_DSYNC O_FSYNC 117 #endif 118 #endif 119 120 /* Approximate O_DIRECT with 
O_DSYNC if O_DIRECT isn't available */ 121 #ifndef O_DIRECT 122 #define O_DIRECT O_DSYNC 123 #endif 124 125 #define FTYPE_FILE 0 126 #define FTYPE_CD 1 127 128 #define MAX_BLOCKSIZE 4096 129 130 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, 131 * leaving a few more bytes for its future use. */ 132 #define RAW_LOCK_PERM_BASE 100 133 #define RAW_LOCK_SHARED_BASE 200 134 135 typedef struct BDRVRawState { 136 int fd; 137 bool use_lock; 138 int type; 139 int open_flags; 140 size_t buf_align; 141 142 /* The current permissions. */ 143 uint64_t perm; 144 uint64_t shared_perm; 145 146 /* The perms bits whose corresponding bytes are already locked in 147 * s->fd. */ 148 uint64_t locked_perm; 149 uint64_t locked_shared_perm; 150 151 int perm_change_fd; 152 int perm_change_flags; 153 BDRVReopenState *reopen_state; 154 155 #ifdef CONFIG_XFS 156 bool is_xfs:1; 157 #endif 158 bool has_discard:1; 159 bool has_write_zeroes:1; 160 bool discard_zeroes:1; 161 bool use_linux_aio:1; 162 bool use_linux_io_uring:1; 163 bool page_cache_inconsistent:1; 164 bool has_fallocate; 165 bool needs_alignment; 166 bool drop_cache; 167 bool check_cache_dropped; 168 struct { 169 uint64_t discard_nb_ok; 170 uint64_t discard_nb_failed; 171 uint64_t discard_bytes_ok; 172 } stats; 173 174 PRManager *pr_mgr; 175 } BDRVRawState; 176 177 typedef struct BDRVRawReopenState { 178 int fd; 179 int open_flags; 180 bool drop_cache; 181 bool check_cache_dropped; 182 } BDRVRawReopenState; 183 184 static int fd_open(BlockDriverState *bs); 185 static int64_t raw_getlength(BlockDriverState *bs); 186 187 typedef struct RawPosixAIOData { 188 BlockDriverState *bs; 189 int aio_type; 190 int aio_fildes; 191 192 off_t aio_offset; 193 uint64_t aio_nbytes; 194 195 union { 196 struct { 197 struct iovec *iov; 198 int niov; 199 } io; 200 struct { 201 uint64_t cmd; 202 void *buf; 203 } ioctl; 204 struct { 205 int aio_fd2; 206 off_t aio_offset2; 207 } copy_range; 208 struct { 209 PreallocMode prealloc; 210 Error **errp; 211 } truncate; 212 }; 213 } RawPosixAIOData; 214 215 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 216 static int cdrom_reopen(BlockDriverState *bs); 217 #endif 218 219 /* 220 * Elide EAGAIN and EACCES details when failing to lock, as this 221 * indicates that the specified file region is already locked by 222 * another process, which is considered a common scenario. 223 */ 224 #define raw_lock_error_setg_errno(errp, err, fmt, ...) 
\ 225 do { \ 226 if ((err) == EAGAIN || (err) == EACCES) { \ 227 error_setg((errp), (fmt), ## __VA_ARGS__); \ 228 } else { \ 229 error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__); \ 230 } \ 231 } while (0) 232 233 #if defined(__NetBSD__) 234 static int raw_normalize_devicepath(const char **filename, Error **errp) 235 { 236 static char namebuf[PATH_MAX]; 237 const char *dp, *fname; 238 struct stat sb; 239 240 fname = *filename; 241 dp = strrchr(fname, '/'); 242 if (lstat(fname, &sb) < 0) { 243 error_setg_file_open(errp, errno, fname); 244 return -errno; 245 } 246 247 if (!S_ISBLK(sb.st_mode)) { 248 return 0; 249 } 250 251 if (dp == NULL) { 252 snprintf(namebuf, PATH_MAX, "r%s", fname); 253 } else { 254 snprintf(namebuf, PATH_MAX, "%.*s/r%s", 255 (int)(dp - fname), fname, dp + 1); 256 } 257 *filename = namebuf; 258 warn_report("%s is a block device, using %s", fname, *filename); 259 260 return 0; 261 } 262 #else 263 static int raw_normalize_devicepath(const char **filename, Error **errp) 264 { 265 return 0; 266 } 267 #endif 268 269 /* 270 * Get logical block size via ioctl. On success store it in @sector_size_p. 271 */ 272 static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) 273 { 274 unsigned int sector_size; 275 bool success = false; 276 int i; 277 278 errno = ENOTSUP; 279 static const unsigned long ioctl_list[] = { 280 #ifdef BLKSSZGET 281 BLKSSZGET, 282 #endif 283 #ifdef DKIOCGETBLOCKSIZE 284 DKIOCGETBLOCKSIZE, 285 #endif 286 #ifdef DIOCGSECTORSIZE 287 DIOCGSECTORSIZE, 288 #endif 289 }; 290 291 /* Try a few ioctls to get the right size */ 292 for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) { 293 if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) { 294 *sector_size_p = sector_size; 295 success = true; 296 } 297 } 298 299 return success ? 0 : -errno; 300 } 301 302 /** 303 * Get physical block size of @fd. 304 * On success, store it in @blk_size and return 0. 305 * On failure, return -errno. 306 */ 307 static int probe_physical_blocksize(int fd, unsigned int *blk_size) 308 { 309 #ifdef BLKPBSZGET 310 if (ioctl(fd, BLKPBSZGET, blk_size) < 0) { 311 return -errno; 312 } 313 return 0; 314 #else 315 return -ENOTSUP; 316 #endif 317 } 318 319 /* 320 * Returns true if no alignment restrictions are necessary even for files 321 * opened with O_DIRECT. 322 * 323 * raw_probe_alignment() probes the required alignment and assumes that 1 means 324 * the probing failed, so it falls back to a safe default of 4k. This can be 325 * avoided if we know that byte alignment is okay for the file. 326 */ 327 static bool dio_byte_aligned(int fd) 328 { 329 #ifdef __linux__ 330 struct statfs buf; 331 int ret; 332 333 ret = fstatfs(fd, &buf); 334 if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) { 335 return true; 336 } 337 #endif 338 return false; 339 } 340 341 /* Check if read is allowed with given memory buffer and length. 342 * 343 * This function is used to check O_DIRECT memory buffer and request alignment. 344 */ 345 static bool raw_is_io_aligned(int fd, void *buf, size_t len) 346 { 347 ssize_t ret = pread(fd, buf, len, 0); 348 349 if (ret >= 0) { 350 return true; 351 } 352 353 #ifdef __linux__ 354 /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore 355 * other errors (e.g. real I/O error), which could happen on a failed 356 * drive, since we only care about probing alignment. 
357 */ 358 if (errno != EINVAL) { 359 return true; 360 } 361 #endif 362 363 return false; 364 } 365 366 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) 367 { 368 BDRVRawState *s = bs->opaque; 369 char *buf; 370 size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size); 371 size_t alignments[] = {1, 512, 1024, 2048, 4096}; 372 373 /* For SCSI generic devices the alignment is not really used. 374 With buffered I/O, we don't have any restrictions. */ 375 if (bdrv_is_sg(bs) || !s->needs_alignment) { 376 bs->bl.request_alignment = 1; 377 s->buf_align = 1; 378 return; 379 } 380 381 bs->bl.request_alignment = 0; 382 s->buf_align = 0; 383 /* Let's try to use the logical blocksize for the alignment. */ 384 if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) { 385 bs->bl.request_alignment = 0; 386 } 387 #ifdef CONFIG_XFS 388 if (s->is_xfs) { 389 struct dioattr da; 390 if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { 391 bs->bl.request_alignment = da.d_miniosz; 392 /* The kernel returns wrong information for d_mem */ 393 /* s->buf_align = da.d_mem; */ 394 } 395 } 396 #endif 397 398 /* 399 * If we could not get the sizes so far, we can only guess them. First try 400 * to detect request alignment, since it is more likely to succeed. Then 401 * try to detect buf_align, which cannot be detected in some cases (e.g. 402 * Gluster). If buf_align cannot be detected, we fallback to the value of 403 * request_alignment. 404 */ 405 406 if (!bs->bl.request_alignment) { 407 int i; 408 size_t align; 409 buf = qemu_memalign(max_align, max_align); 410 for (i = 0; i < ARRAY_SIZE(alignments); i++) { 411 align = alignments[i]; 412 if (raw_is_io_aligned(fd, buf, align)) { 413 /* Fallback to safe value. */ 414 bs->bl.request_alignment = (align != 1) ? align : max_align; 415 break; 416 } 417 } 418 qemu_vfree(buf); 419 } 420 421 if (!s->buf_align) { 422 int i; 423 size_t align; 424 buf = qemu_memalign(max_align, 2 * max_align); 425 for (i = 0; i < ARRAY_SIZE(alignments); i++) { 426 align = alignments[i]; 427 if (raw_is_io_aligned(fd, buf + align, max_align)) { 428 /* Fallback to request_alignment. */ 429 s->buf_align = (align != 1) ? align : bs->bl.request_alignment; 430 break; 431 } 432 } 433 qemu_vfree(buf); 434 } 435 436 if (!s->buf_align || !bs->bl.request_alignment) { 437 error_setg(errp, "Could not find working O_DIRECT alignment"); 438 error_append_hint(errp, "Try cache.direct=off\n"); 439 } 440 } 441 442 static int check_hdev_writable(int fd) 443 { 444 #if defined(BLKROGET) 445 /* Linux block devices can be configured "read-only" using blockdev(8). 446 * This is independent of device node permissions and therefore open(2) 447 * with O_RDWR succeeds. Actual writes fail with EPERM. 448 * 449 * bdrv_open() is supposed to fail if the disk is read-only. Explicitly 450 * check for read-only block devices so that Linux block devices behave 451 * properly. 
452 */ 453 struct stat st; 454 int readonly = 0; 455 456 if (fstat(fd, &st)) { 457 return -errno; 458 } 459 460 if (!S_ISBLK(st.st_mode)) { 461 return 0; 462 } 463 464 if (ioctl(fd, BLKROGET, &readonly) < 0) { 465 return -errno; 466 } 467 468 if (readonly) { 469 return -EACCES; 470 } 471 #endif /* defined(BLKROGET) */ 472 return 0; 473 } 474 475 static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers) 476 { 477 bool read_write = false; 478 assert(open_flags != NULL); 479 480 *open_flags |= O_BINARY; 481 *open_flags &= ~O_ACCMODE; 482 483 if (bdrv_flags & BDRV_O_AUTO_RDONLY) { 484 read_write = has_writers; 485 } else if (bdrv_flags & BDRV_O_RDWR) { 486 read_write = true; 487 } 488 489 if (read_write) { 490 *open_flags |= O_RDWR; 491 } else { 492 *open_flags |= O_RDONLY; 493 } 494 495 /* Use O_DSYNC for write-through caching, no flags for write-back caching, 496 * and O_DIRECT for no caching. */ 497 if ((bdrv_flags & BDRV_O_NOCACHE)) { 498 *open_flags |= O_DIRECT; 499 } 500 } 501 502 static void raw_parse_filename(const char *filename, QDict *options, 503 Error **errp) 504 { 505 bdrv_parse_filename_strip_prefix(filename, "file:", options); 506 } 507 508 static QemuOptsList raw_runtime_opts = { 509 .name = "raw", 510 .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), 511 .desc = { 512 { 513 .name = "filename", 514 .type = QEMU_OPT_STRING, 515 .help = "File name of the image", 516 }, 517 { 518 .name = "aio", 519 .type = QEMU_OPT_STRING, 520 .help = "host AIO implementation (threads, native, io_uring)", 521 }, 522 { 523 .name = "locking", 524 .type = QEMU_OPT_STRING, 525 .help = "file locking mode (on/off/auto, default: auto)", 526 }, 527 { 528 .name = "pr-manager", 529 .type = QEMU_OPT_STRING, 530 .help = "id of persistent reservation manager object (default: none)", 531 }, 532 #if defined(__linux__) 533 { 534 .name = "drop-cache", 535 .type = QEMU_OPT_BOOL, 536 .help = "invalidate page cache during live migration (default: on)", 537 }, 538 #endif 539 { 540 .name = "x-check-cache-dropped", 541 .type = QEMU_OPT_BOOL, 542 .help = "check that page cache was dropped on live migration (default: off)" 543 }, 544 { /* end of list */ } 545 }, 546 }; 547 548 static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL }; 549 550 static int raw_open_common(BlockDriverState *bs, QDict *options, 551 int bdrv_flags, int open_flags, 552 bool device, Error **errp) 553 { 554 BDRVRawState *s = bs->opaque; 555 QemuOpts *opts; 556 Error *local_err = NULL; 557 const char *filename = NULL; 558 const char *str; 559 BlockdevAioOptions aio, aio_default; 560 int fd, ret; 561 struct stat st; 562 OnOffAuto locking; 563 564 opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); 565 if (!qemu_opts_absorb_qdict(opts, options, errp)) { 566 ret = -EINVAL; 567 goto fail; 568 } 569 570 filename = qemu_opt_get(opts, "filename"); 571 572 ret = raw_normalize_devicepath(&filename, errp); 573 if (ret != 0) { 574 goto fail; 575 } 576 577 if (bdrv_flags & BDRV_O_NATIVE_AIO) { 578 aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE; 579 #ifdef CONFIG_LINUX_IO_URING 580 } else if (bdrv_flags & BDRV_O_IO_URING) { 581 aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING; 582 #endif 583 } else { 584 aio_default = BLOCKDEV_AIO_OPTIONS_THREADS; 585 } 586 587 aio = qapi_enum_parse(&BlockdevAioOptions_lookup, 588 qemu_opt_get(opts, "aio"), 589 aio_default, &local_err); 590 if (local_err) { 591 error_propagate(errp, local_err); 592 ret = -EINVAL; 593 goto fail; 594 } 595 596 s->use_linux_aio = (aio == 
BLOCKDEV_AIO_OPTIONS_NATIVE); 597 #ifdef CONFIG_LINUX_IO_URING 598 s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); 599 #endif 600 601 locking = qapi_enum_parse(&OnOffAuto_lookup, 602 qemu_opt_get(opts, "locking"), 603 ON_OFF_AUTO_AUTO, &local_err); 604 if (local_err) { 605 error_propagate(errp, local_err); 606 ret = -EINVAL; 607 goto fail; 608 } 609 switch (locking) { 610 case ON_OFF_AUTO_ON: 611 s->use_lock = true; 612 if (!qemu_has_ofd_lock()) { 613 warn_report("File lock requested but OFD locking syscall is " 614 "unavailable, falling back to POSIX file locks"); 615 error_printf("Due to the implementation, locks can be lost " 616 "unexpectedly.\n"); 617 } 618 break; 619 case ON_OFF_AUTO_OFF: 620 s->use_lock = false; 621 break; 622 case ON_OFF_AUTO_AUTO: 623 s->use_lock = qemu_has_ofd_lock(); 624 break; 625 default: 626 abort(); 627 } 628 629 str = qemu_opt_get(opts, "pr-manager"); 630 if (str) { 631 s->pr_mgr = pr_manager_lookup(str, &local_err); 632 if (local_err) { 633 error_propagate(errp, local_err); 634 ret = -EINVAL; 635 goto fail; 636 } 637 } 638 639 s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true); 640 s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped", 641 false); 642 643 s->open_flags = open_flags; 644 raw_parse_flags(bdrv_flags, &s->open_flags, false); 645 646 s->fd = -1; 647 fd = qemu_open(filename, s->open_flags, errp); 648 ret = fd < 0 ? -errno : 0; 649 650 if (ret < 0) { 651 if (ret == -EROFS) { 652 ret = -EACCES; 653 } 654 goto fail; 655 } 656 s->fd = fd; 657 658 /* Check s->open_flags rather than bdrv_flags due to auto-read-only */ 659 if (s->open_flags & O_RDWR) { 660 ret = check_hdev_writable(s->fd); 661 if (ret < 0) { 662 error_setg_errno(errp, -ret, "The device is not writable"); 663 goto fail; 664 } 665 } 666 667 s->perm = 0; 668 s->shared_perm = BLK_PERM_ALL; 669 670 #ifdef CONFIG_LINUX_AIO 671 /* Currently Linux does AIO only for files opened with O_DIRECT */ 672 if (s->use_linux_aio) { 673 if (!(s->open_flags & O_DIRECT)) { 674 error_setg(errp, "aio=native was specified, but it requires " 675 "cache.direct=on, which was not specified."); 676 ret = -EINVAL; 677 goto fail; 678 } 679 if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) { 680 error_prepend(errp, "Unable to use native AIO: "); 681 goto fail; 682 } 683 } 684 #else 685 if (s->use_linux_aio) { 686 error_setg(errp, "aio=native was specified, but is not supported " 687 "in this build."); 688 ret = -EINVAL; 689 goto fail; 690 } 691 #endif /* !defined(CONFIG_LINUX_AIO) */ 692 693 #ifdef CONFIG_LINUX_IO_URING 694 if (s->use_linux_io_uring) { 695 if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) { 696 error_prepend(errp, "Unable to use io_uring: "); 697 goto fail; 698 } 699 } 700 #else 701 if (s->use_linux_io_uring) { 702 error_setg(errp, "aio=io_uring was specified, but is not supported " 703 "in this build."); 704 ret = -EINVAL; 705 goto fail; 706 } 707 #endif /* !defined(CONFIG_LINUX_IO_URING) */ 708 709 s->has_discard = true; 710 s->has_write_zeroes = true; 711 if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) { 712 s->needs_alignment = true; 713 } 714 715 if (fstat(s->fd, &st) < 0) { 716 ret = -errno; 717 error_setg_errno(errp, errno, "Could not stat file"); 718 goto fail; 719 } 720 721 if (!device) { 722 if (S_ISBLK(st.st_mode)) { 723 warn_report("Opening a block device as a file using the '%s' " 724 "driver is deprecated", bs->drv->format_name); 725 } else if (S_ISCHR(st.st_mode)) { 726 warn_report("Opening a 
character device as a file using the '%s' " 727 "driver is deprecated", bs->drv->format_name); 728 } else if (!S_ISREG(st.st_mode)) { 729 error_setg(errp, "A regular file was expected by the '%s' driver, " 730 "but something else was given", bs->drv->format_name); 731 ret = -EINVAL; 732 goto fail; 733 } else { 734 s->discard_zeroes = true; 735 s->has_fallocate = true; 736 } 737 } else { 738 if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { 739 error_setg(errp, "'%s' driver expects either " 740 "a character or block device", bs->drv->format_name); 741 ret = -EINVAL; 742 goto fail; 743 } 744 } 745 746 if (S_ISBLK(st.st_mode)) { 747 #ifdef BLKDISCARDZEROES 748 unsigned int arg; 749 if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { 750 s->discard_zeroes = true; 751 } 752 #endif 753 #ifdef __linux__ 754 /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do 755 * not rely on the contents of discarded blocks unless using O_DIRECT. 756 * Same for BLKZEROOUT. 757 */ 758 if (!(bs->open_flags & BDRV_O_NOCACHE)) { 759 s->discard_zeroes = false; 760 s->has_write_zeroes = false; 761 } 762 #endif 763 } 764 #ifdef __FreeBSD__ 765 if (S_ISCHR(st.st_mode)) { 766 /* 767 * The file is a char device (disk), which on FreeBSD isn't behind 768 * a pager, so force all requests to be aligned. This is needed 769 * so QEMU makes sure all IO operations on the device are aligned 770 * to sector size, or else FreeBSD will reject them with EINVAL. 771 */ 772 s->needs_alignment = true; 773 } 774 #endif 775 776 #ifdef CONFIG_XFS 777 if (platform_test_xfs_fd(s->fd)) { 778 s->is_xfs = true; 779 } 780 #endif 781 782 bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; 783 if (S_ISREG(st.st_mode)) { 784 /* When extending regular files, we get zeros from the OS */ 785 bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; 786 } 787 ret = 0; 788 fail: 789 if (ret < 0 && s->fd != -1) { 790 qemu_close(s->fd); 791 } 792 if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { 793 unlink(filename); 794 } 795 qemu_opts_del(opts); 796 return ret; 797 } 798 799 static int raw_open(BlockDriverState *bs, QDict *options, int flags, 800 Error **errp) 801 { 802 BDRVRawState *s = bs->opaque; 803 804 s->type = FTYPE_FILE; 805 return raw_open_common(bs, options, flags, 0, false, errp); 806 } 807 808 typedef enum { 809 RAW_PL_PREPARE, 810 RAW_PL_COMMIT, 811 RAW_PL_ABORT, 812 } RawPermLockOp; 813 814 #define PERM_FOREACH(i) \ 815 for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++) 816 817 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the 818 * file; if @unlock == true, also unlock the unneeded bytes. 819 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared. 820 */ 821 static int raw_apply_lock_bytes(BDRVRawState *s, int fd, 822 uint64_t perm_lock_bits, 823 uint64_t shared_perm_lock_bits, 824 bool unlock, Error **errp) 825 { 826 int ret; 827 int i; 828 uint64_t locked_perm, locked_shared_perm; 829 830 if (s) { 831 locked_perm = s->locked_perm; 832 locked_shared_perm = s->locked_shared_perm; 833 } else { 834 /* 835 * We don't have the previous bits, just lock/unlock for each of the 836 * requested bits. 
837 */ 838 if (unlock) { 839 locked_perm = BLK_PERM_ALL; 840 locked_shared_perm = BLK_PERM_ALL; 841 } else { 842 locked_perm = 0; 843 locked_shared_perm = 0; 844 } 845 } 846 847 PERM_FOREACH(i) { 848 int off = RAW_LOCK_PERM_BASE + i; 849 uint64_t bit = (1ULL << i); 850 if ((perm_lock_bits & bit) && !(locked_perm & bit)) { 851 ret = qemu_lock_fd(fd, off, 1, false); 852 if (ret) { 853 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d", 854 off); 855 return ret; 856 } else if (s) { 857 s->locked_perm |= bit; 858 } 859 } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) { 860 ret = qemu_unlock_fd(fd, off, 1); 861 if (ret) { 862 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off); 863 return ret; 864 } else if (s) { 865 s->locked_perm &= ~bit; 866 } 867 } 868 } 869 PERM_FOREACH(i) { 870 int off = RAW_LOCK_SHARED_BASE + i; 871 uint64_t bit = (1ULL << i); 872 if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) { 873 ret = qemu_lock_fd(fd, off, 1, false); 874 if (ret) { 875 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d", 876 off); 877 return ret; 878 } else if (s) { 879 s->locked_shared_perm |= bit; 880 } 881 } else if (unlock && (locked_shared_perm & bit) && 882 !(shared_perm_lock_bits & bit)) { 883 ret = qemu_unlock_fd(fd, off, 1); 884 if (ret) { 885 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off); 886 return ret; 887 } else if (s) { 888 s->locked_shared_perm &= ~bit; 889 } 890 } 891 } 892 return 0; 893 } 894 895 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */ 896 static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm, 897 Error **errp) 898 { 899 int ret; 900 int i; 901 902 PERM_FOREACH(i) { 903 int off = RAW_LOCK_SHARED_BASE + i; 904 uint64_t p = 1ULL << i; 905 if (perm & p) { 906 ret = qemu_lock_fd_test(fd, off, 1, true); 907 if (ret) { 908 char *perm_name = bdrv_perm_names(p); 909 910 raw_lock_error_setg_errno(errp, -ret, 911 "Failed to get \"%s\" lock", 912 perm_name); 913 g_free(perm_name); 914 return ret; 915 } 916 } 917 } 918 PERM_FOREACH(i) { 919 int off = RAW_LOCK_PERM_BASE + i; 920 uint64_t p = 1ULL << i; 921 if (!(shared_perm & p)) { 922 ret = qemu_lock_fd_test(fd, off, 1, true); 923 if (ret) { 924 char *perm_name = bdrv_perm_names(p); 925 926 raw_lock_error_setg_errno(errp, -ret, 927 "Failed to get shared \"%s\" lock", 928 perm_name); 929 g_free(perm_name); 930 return ret; 931 } 932 } 933 } 934 return 0; 935 } 936 937 static int raw_handle_perm_lock(BlockDriverState *bs, 938 RawPermLockOp op, 939 uint64_t new_perm, uint64_t new_shared, 940 Error **errp) 941 { 942 BDRVRawState *s = bs->opaque; 943 int ret = 0; 944 Error *local_err = NULL; 945 946 if (!s->use_lock) { 947 return 0; 948 } 949 950 if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) { 951 return 0; 952 } 953 954 switch (op) { 955 case RAW_PL_PREPARE: 956 if ((s->perm | new_perm) == s->perm && 957 (s->shared_perm & new_shared) == s->shared_perm) 958 { 959 /* 960 * We are going to unlock bytes, it should not fail. If it fail due 961 * to some fs-dependent permission-unrelated reasons (which occurs 962 * sometimes on NFS and leads to abort in bdrv_replace_child) we 963 * can't prevent such errors by any check here. And we ignore them 964 * anyway in ABORT and COMMIT. 
965 */ 966 return 0; 967 } 968 ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm, 969 ~s->shared_perm | ~new_shared, 970 false, errp); 971 if (!ret) { 972 ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp); 973 if (!ret) { 974 return 0; 975 } 976 error_append_hint(errp, 977 "Is another process using the image [%s]?\n", 978 bs->filename); 979 } 980 /* fall through to unlock bytes. */ 981 case RAW_PL_ABORT: 982 raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm, 983 true, &local_err); 984 if (local_err) { 985 /* Theoretically the above call only unlocks bytes and it cannot 986 * fail. Something weird happened, report it. 987 */ 988 warn_report_err(local_err); 989 } 990 break; 991 case RAW_PL_COMMIT: 992 raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared, 993 true, &local_err); 994 if (local_err) { 995 /* Theoretically the above call only unlocks bytes and it cannot 996 * fail. Something weird happened, report it. 997 */ 998 warn_report_err(local_err); 999 } 1000 break; 1001 } 1002 return ret; 1003 } 1004 1005 static int raw_reconfigure_getfd(BlockDriverState *bs, int flags, 1006 int *open_flags, uint64_t perm, bool force_dup, 1007 Error **errp) 1008 { 1009 BDRVRawState *s = bs->opaque; 1010 int fd = -1; 1011 int ret; 1012 bool has_writers = perm & 1013 (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE); 1014 int fcntl_flags = O_APPEND | O_NONBLOCK; 1015 #ifdef O_NOATIME 1016 fcntl_flags |= O_NOATIME; 1017 #endif 1018 1019 *open_flags = 0; 1020 if (s->type == FTYPE_CD) { 1021 *open_flags |= O_NONBLOCK; 1022 } 1023 1024 raw_parse_flags(flags, open_flags, has_writers); 1025 1026 #ifdef O_ASYNC 1027 /* Not all operating systems have O_ASYNC, and those that don't 1028 * will not let us track the state into rs->open_flags (typically 1029 * you achieve the same effect with an ioctl, for example I_SETSIG 1030 * on Solaris). But we do not use O_ASYNC, so that's fine. 
1031 */ 1032 assert((s->open_flags & O_ASYNC) == 0); 1033 #endif 1034 1035 if (!force_dup && *open_flags == s->open_flags) { 1036 /* We're lucky, the existing fd is fine */ 1037 return s->fd; 1038 } 1039 1040 if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { 1041 /* dup the original fd */ 1042 fd = qemu_dup(s->fd); 1043 if (fd >= 0) { 1044 ret = fcntl_setfl(fd, *open_flags); 1045 if (ret) { 1046 qemu_close(fd); 1047 fd = -1; 1048 } 1049 } 1050 } 1051 1052 /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ 1053 if (fd == -1) { 1054 const char *normalized_filename = bs->filename; 1055 ret = raw_normalize_devicepath(&normalized_filename, errp); 1056 if (ret >= 0) { 1057 fd = qemu_open(normalized_filename, *open_flags, errp); 1058 if (fd == -1) { 1059 return -1; 1060 } 1061 } 1062 } 1063 1064 if (fd != -1 && (*open_flags & O_RDWR)) { 1065 ret = check_hdev_writable(fd); 1066 if (ret < 0) { 1067 qemu_close(fd); 1068 error_setg_errno(errp, -ret, "The device is not writable"); 1069 return -1; 1070 } 1071 } 1072 1073 return fd; 1074 } 1075 1076 static int raw_reopen_prepare(BDRVReopenState *state, 1077 BlockReopenQueue *queue, Error **errp) 1078 { 1079 BDRVRawState *s; 1080 BDRVRawReopenState *rs; 1081 QemuOpts *opts; 1082 int ret; 1083 Error *local_err = NULL; 1084 1085 assert(state != NULL); 1086 assert(state->bs != NULL); 1087 1088 s = state->bs->opaque; 1089 1090 state->opaque = g_new0(BDRVRawReopenState, 1); 1091 rs = state->opaque; 1092 1093 /* Handle options changes */ 1094 opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); 1095 if (!qemu_opts_absorb_qdict(opts, state->options, errp)) { 1096 ret = -EINVAL; 1097 goto out; 1098 } 1099 1100 rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true); 1101 rs->check_cache_dropped = 1102 qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false); 1103 1104 /* This driver's reopen function doesn't currently allow changing 1105 * other options, so let's put them back in the original QDict and 1106 * bdrv_reopen_prepare() will detect changes and complain. */ 1107 qemu_opts_to_qdict(opts, state->options); 1108 1109 rs->fd = raw_reconfigure_getfd(state->bs, state->flags, &rs->open_flags, 1110 state->perm, true, &local_err); 1111 if (local_err) { 1112 error_propagate(errp, local_err); 1113 ret = -1; 1114 goto out; 1115 } 1116 1117 /* Fail already reopen_prepare() if we can't get a working O_DIRECT 1118 * alignment with the new fd. 
*/ 1119 if (rs->fd != -1) { 1120 raw_probe_alignment(state->bs, rs->fd, &local_err); 1121 if (local_err) { 1122 error_propagate(errp, local_err); 1123 ret = -EINVAL; 1124 goto out_fd; 1125 } 1126 } 1127 1128 s->reopen_state = state; 1129 ret = 0; 1130 out_fd: 1131 if (ret < 0) { 1132 qemu_close(rs->fd); 1133 rs->fd = -1; 1134 } 1135 out: 1136 qemu_opts_del(opts); 1137 return ret; 1138 } 1139 1140 static void raw_reopen_commit(BDRVReopenState *state) 1141 { 1142 BDRVRawReopenState *rs = state->opaque; 1143 BDRVRawState *s = state->bs->opaque; 1144 1145 s->drop_cache = rs->drop_cache; 1146 s->check_cache_dropped = rs->check_cache_dropped; 1147 s->open_flags = rs->open_flags; 1148 1149 qemu_close(s->fd); 1150 s->fd = rs->fd; 1151 1152 g_free(state->opaque); 1153 state->opaque = NULL; 1154 1155 assert(s->reopen_state == state); 1156 s->reopen_state = NULL; 1157 } 1158 1159 1160 static void raw_reopen_abort(BDRVReopenState *state) 1161 { 1162 BDRVRawReopenState *rs = state->opaque; 1163 BDRVRawState *s = state->bs->opaque; 1164 1165 /* nothing to do if NULL, we didn't get far enough */ 1166 if (rs == NULL) { 1167 return; 1168 } 1169 1170 if (rs->fd >= 0) { 1171 qemu_close(rs->fd); 1172 rs->fd = -1; 1173 } 1174 g_free(state->opaque); 1175 state->opaque = NULL; 1176 1177 assert(s->reopen_state == state); 1178 s->reopen_state = NULL; 1179 } 1180 1181 static int sg_get_max_transfer_length(int fd) 1182 { 1183 #ifdef BLKSECTGET 1184 int max_bytes = 0; 1185 1186 if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) { 1187 return max_bytes; 1188 } else { 1189 return -errno; 1190 } 1191 #else 1192 return -ENOSYS; 1193 #endif 1194 } 1195 1196 static int sg_get_max_segments(int fd) 1197 { 1198 #ifdef CONFIG_LINUX 1199 char buf[32]; 1200 const char *end; 1201 char *sysfspath = NULL; 1202 int ret; 1203 int sysfd = -1; 1204 long max_segments; 1205 struct stat st; 1206 1207 if (fstat(fd, &st)) { 1208 ret = -errno; 1209 goto out; 1210 } 1211 1212 sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments", 1213 major(st.st_rdev), minor(st.st_rdev)); 1214 sysfd = open(sysfspath, O_RDONLY); 1215 if (sysfd == -1) { 1216 ret = -errno; 1217 goto out; 1218 } 1219 do { 1220 ret = read(sysfd, buf, sizeof(buf) - 1); 1221 } while (ret == -1 && errno == EINTR); 1222 if (ret < 0) { 1223 ret = -errno; 1224 goto out; 1225 } else if (ret == 0) { 1226 ret = -EIO; 1227 goto out; 1228 } 1229 buf[ret] = 0; 1230 /* The file is ended with '\n', pass 'end' to accept that. 
*/ 1231 ret = qemu_strtol(buf, &end, 10, &max_segments); 1232 if (ret == 0 && end && *end == '\n') { 1233 ret = max_segments; 1234 } 1235 1236 out: 1237 if (sysfd != -1) { 1238 close(sysfd); 1239 } 1240 g_free(sysfspath); 1241 return ret; 1242 #else 1243 return -ENOTSUP; 1244 #endif 1245 } 1246 1247 static void raw_refresh_limits(BlockDriverState *bs, Error **errp) 1248 { 1249 BDRVRawState *s = bs->opaque; 1250 1251 if (bs->sg) { 1252 int ret = sg_get_max_transfer_length(s->fd); 1253 1254 if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { 1255 bs->bl.max_transfer = pow2floor(ret); 1256 } 1257 1258 ret = sg_get_max_segments(s->fd); 1259 if (ret > 0) { 1260 bs->bl.max_transfer = MIN(bs->bl.max_transfer, 1261 ret * qemu_real_host_page_size); 1262 } 1263 } 1264 1265 raw_probe_alignment(bs, s->fd, errp); 1266 bs->bl.min_mem_alignment = s->buf_align; 1267 bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size); 1268 } 1269 1270 static int check_for_dasd(int fd) 1271 { 1272 #ifdef BIODASDINFO2 1273 struct dasd_information2_t info = {0}; 1274 1275 return ioctl(fd, BIODASDINFO2, &info); 1276 #else 1277 return -1; 1278 #endif 1279 } 1280 1281 /** 1282 * Try to get @bs's logical and physical block size. 1283 * On success, store them in @bsz and return zero. 1284 * On failure, return negative errno. 1285 */ 1286 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) 1287 { 1288 BDRVRawState *s = bs->opaque; 1289 int ret; 1290 1291 /* If DASD, get blocksizes */ 1292 if (check_for_dasd(s->fd) < 0) { 1293 return -ENOTSUP; 1294 } 1295 ret = probe_logical_blocksize(s->fd, &bsz->log); 1296 if (ret < 0) { 1297 return ret; 1298 } 1299 return probe_physical_blocksize(s->fd, &bsz->phys); 1300 } 1301 1302 /** 1303 * Try to get @bs's geometry: cyls, heads, sectors. 1304 * On success, store them in @geo and return 0. 1305 * On failure return -errno. 1306 * (Allows block driver to assign default geometry values that guest sees) 1307 */ 1308 #ifdef __linux__ 1309 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 1310 { 1311 BDRVRawState *s = bs->opaque; 1312 struct hd_geometry ioctl_geo = {0}; 1313 1314 /* If DASD, get its geometry */ 1315 if (check_for_dasd(s->fd) < 0) { 1316 return -ENOTSUP; 1317 } 1318 if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { 1319 return -errno; 1320 } 1321 /* HDIO_GETGEO may return success even though geo contains zeros 1322 (e.g. 
certain multipath setups) */ 1323 if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { 1324 return -ENOTSUP; 1325 } 1326 /* Do not return a geometry for partition */ 1327 if (ioctl_geo.start != 0) { 1328 return -ENOTSUP; 1329 } 1330 geo->heads = ioctl_geo.heads; 1331 geo->sectors = ioctl_geo.sectors; 1332 geo->cylinders = ioctl_geo.cylinders; 1333 1334 return 0; 1335 } 1336 #else /* __linux__ */ 1337 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 1338 { 1339 return -ENOTSUP; 1340 } 1341 #endif 1342 1343 #if defined(__linux__) 1344 static int handle_aiocb_ioctl(void *opaque) 1345 { 1346 RawPosixAIOData *aiocb = opaque; 1347 int ret; 1348 1349 ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf); 1350 if (ret == -1) { 1351 return -errno; 1352 } 1353 1354 return 0; 1355 } 1356 #endif /* linux */ 1357 1358 static int handle_aiocb_flush(void *opaque) 1359 { 1360 RawPosixAIOData *aiocb = opaque; 1361 BDRVRawState *s = aiocb->bs->opaque; 1362 int ret; 1363 1364 if (s->page_cache_inconsistent) { 1365 return -EIO; 1366 } 1367 1368 ret = qemu_fdatasync(aiocb->aio_fildes); 1369 if (ret == -1) { 1370 /* There is no clear definition of the semantics of a failing fsync(), 1371 * so we may have to assume the worst. The sad truth is that this 1372 * assumption is correct for Linux. Some pages are now probably marked 1373 * clean in the page cache even though they are inconsistent with the 1374 * on-disk contents. The next fdatasync() call would succeed, but no 1375 * further writeback attempt will be made. We can't get back to a state 1376 * in which we know what is on disk (we would have to rewrite 1377 * everything that was touched since the last fdatasync() at least), so 1378 * make bdrv_flush() fail permanently. Given that the behaviour isn't 1379 * really defined, I have little hope that other OSes are doing better. 1380 * 1381 * Obviously, this doesn't affect O_DIRECT, which bypasses the page 1382 * cache. */ 1383 if ((s->open_flags & O_DIRECT) == 0) { 1384 s->page_cache_inconsistent = true; 1385 } 1386 return -errno; 1387 } 1388 return 0; 1389 } 1390 1391 #ifdef CONFIG_PREADV 1392 1393 static bool preadv_present = true; 1394 1395 static ssize_t 1396 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1397 { 1398 return preadv(fd, iov, nr_iov, offset); 1399 } 1400 1401 static ssize_t 1402 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1403 { 1404 return pwritev(fd, iov, nr_iov, offset); 1405 } 1406 1407 #else 1408 1409 static bool preadv_present = false; 1410 1411 static ssize_t 1412 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1413 { 1414 return -ENOSYS; 1415 } 1416 1417 static ssize_t 1418 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1419 { 1420 return -ENOSYS; 1421 } 1422 1423 #endif 1424 1425 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) 1426 { 1427 ssize_t len; 1428 1429 do { 1430 if (aiocb->aio_type & QEMU_AIO_WRITE) 1431 len = qemu_pwritev(aiocb->aio_fildes, 1432 aiocb->io.iov, 1433 aiocb->io.niov, 1434 aiocb->aio_offset); 1435 else 1436 len = qemu_preadv(aiocb->aio_fildes, 1437 aiocb->io.iov, 1438 aiocb->io.niov, 1439 aiocb->aio_offset); 1440 } while (len == -1 && errno == EINTR); 1441 1442 if (len == -1) { 1443 return -errno; 1444 } 1445 return len; 1446 } 1447 1448 /* 1449 * Reads/writes the data to/from a given linear buffer. 1450 * 1451 * Returns the number of bytes handled or -errno in case of an error. 
Short 1452 * reads are only returned if the end of the file is reached. 1453 */ 1454 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) 1455 { 1456 ssize_t offset = 0; 1457 ssize_t len; 1458 1459 while (offset < aiocb->aio_nbytes) { 1460 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1461 len = pwrite(aiocb->aio_fildes, 1462 (const char *)buf + offset, 1463 aiocb->aio_nbytes - offset, 1464 aiocb->aio_offset + offset); 1465 } else { 1466 len = pread(aiocb->aio_fildes, 1467 buf + offset, 1468 aiocb->aio_nbytes - offset, 1469 aiocb->aio_offset + offset); 1470 } 1471 if (len == -1 && errno == EINTR) { 1472 continue; 1473 } else if (len == -1 && errno == EINVAL && 1474 (aiocb->bs->open_flags & BDRV_O_NOCACHE) && 1475 !(aiocb->aio_type & QEMU_AIO_WRITE) && 1476 offset > 0) { 1477 /* O_DIRECT pread() may fail with EINVAL when offset is unaligned 1478 * after a short read. Assume that O_DIRECT short reads only occur 1479 * at EOF. Therefore this is a short read, not an I/O error. 1480 */ 1481 break; 1482 } else if (len == -1) { 1483 offset = -errno; 1484 break; 1485 } else if (len == 0) { 1486 break; 1487 } 1488 offset += len; 1489 } 1490 1491 return offset; 1492 } 1493 1494 static int handle_aiocb_rw(void *opaque) 1495 { 1496 RawPosixAIOData *aiocb = opaque; 1497 ssize_t nbytes; 1498 char *buf; 1499 1500 if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { 1501 /* 1502 * If there is just a single buffer, and it is properly aligned 1503 * we can just use plain pread/pwrite without any problems. 1504 */ 1505 if (aiocb->io.niov == 1) { 1506 nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base); 1507 goto out; 1508 } 1509 /* 1510 * We have more than one iovec, and all are properly aligned. 1511 * 1512 * Try preadv/pwritev first and fall back to linearizing the 1513 * buffer if it's not supported. 1514 */ 1515 if (preadv_present) { 1516 nbytes = handle_aiocb_rw_vector(aiocb); 1517 if (nbytes == aiocb->aio_nbytes || 1518 (nbytes < 0 && nbytes != -ENOSYS)) { 1519 goto out; 1520 } 1521 preadv_present = false; 1522 } 1523 1524 /* 1525 * XXX(hch): short read/write. no easy way to handle the remainder 1526 * using these interfaces. For now retry using plain 1527 * pread/pwrite? 1528 */ 1529 } 1530 1531 /* 1532 * Ok, we have to do it the hard way, copy all segments into 1533 * a single aligned buffer. 
1534 */ 1535 buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); 1536 if (buf == NULL) { 1537 nbytes = -ENOMEM; 1538 goto out; 1539 } 1540 1541 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1542 char *p = buf; 1543 int i; 1544 1545 for (i = 0; i < aiocb->io.niov; ++i) { 1546 memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len); 1547 p += aiocb->io.iov[i].iov_len; 1548 } 1549 assert(p - buf == aiocb->aio_nbytes); 1550 } 1551 1552 nbytes = handle_aiocb_rw_linear(aiocb, buf); 1553 if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { 1554 char *p = buf; 1555 size_t count = aiocb->aio_nbytes, copy; 1556 int i; 1557 1558 for (i = 0; i < aiocb->io.niov && count; ++i) { 1559 copy = count; 1560 if (copy > aiocb->io.iov[i].iov_len) { 1561 copy = aiocb->io.iov[i].iov_len; 1562 } 1563 memcpy(aiocb->io.iov[i].iov_base, p, copy); 1564 assert(count >= copy); 1565 p += copy; 1566 count -= copy; 1567 } 1568 assert(count == 0); 1569 } 1570 qemu_vfree(buf); 1571 1572 out: 1573 if (nbytes == aiocb->aio_nbytes) { 1574 return 0; 1575 } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { 1576 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1577 return -EINVAL; 1578 } else { 1579 iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes, 1580 0, aiocb->aio_nbytes - nbytes); 1581 return 0; 1582 } 1583 } else { 1584 assert(nbytes < 0); 1585 return nbytes; 1586 } 1587 } 1588 1589 static int translate_err(int err) 1590 { 1591 if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || 1592 err == -ENOTTY) { 1593 err = -ENOTSUP; 1594 } 1595 return err; 1596 } 1597 1598 #ifdef CONFIG_FALLOCATE 1599 static int do_fallocate(int fd, int mode, off_t offset, off_t len) 1600 { 1601 do { 1602 if (fallocate(fd, mode, offset, len) == 0) { 1603 return 0; 1604 } 1605 } while (errno == EINTR); 1606 return translate_err(-errno); 1607 } 1608 #endif 1609 1610 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) 1611 { 1612 int ret = -ENOTSUP; 1613 BDRVRawState *s = aiocb->bs->opaque; 1614 1615 if (!s->has_write_zeroes) { 1616 return -ENOTSUP; 1617 } 1618 1619 #ifdef BLKZEROOUT 1620 /* The BLKZEROOUT implementation in the kernel doesn't set 1621 * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow 1622 * fallbacks. */ 1623 if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) { 1624 do { 1625 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1626 if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { 1627 return 0; 1628 } 1629 } while (errno == EINTR); 1630 1631 ret = translate_err(-errno); 1632 if (ret == -ENOTSUP) { 1633 s->has_write_zeroes = false; 1634 } 1635 } 1636 #endif 1637 1638 return ret; 1639 } 1640 1641 static int handle_aiocb_write_zeroes(void *opaque) 1642 { 1643 RawPosixAIOData *aiocb = opaque; 1644 #ifdef CONFIG_FALLOCATE 1645 BDRVRawState *s = aiocb->bs->opaque; 1646 int64_t len; 1647 #endif 1648 1649 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1650 return handle_aiocb_write_zeroes_block(aiocb); 1651 } 1652 1653 #ifdef CONFIG_FALLOCATE_ZERO_RANGE 1654 if (s->has_write_zeroes) { 1655 int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, 1656 aiocb->aio_offset, aiocb->aio_nbytes); 1657 if (ret == -EINVAL) { 1658 /* 1659 * Allow falling back to pwrite for file systems that 1660 * do not support fallocate() for an unaligned byte range. 
1661 */ 1662 return -ENOTSUP; 1663 } 1664 if (ret == 0 || ret != -ENOTSUP) { 1665 return ret; 1666 } 1667 s->has_write_zeroes = false; 1668 } 1669 #endif 1670 1671 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1672 if (s->has_discard && s->has_fallocate) { 1673 int ret = do_fallocate(s->fd, 1674 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1675 aiocb->aio_offset, aiocb->aio_nbytes); 1676 if (ret == 0) { 1677 ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1678 if (ret == 0 || ret != -ENOTSUP) { 1679 return ret; 1680 } 1681 s->has_fallocate = false; 1682 } else if (ret != -ENOTSUP) { 1683 return ret; 1684 } else { 1685 s->has_discard = false; 1686 } 1687 } 1688 #endif 1689 1690 #ifdef CONFIG_FALLOCATE 1691 /* Last resort: we are trying to extend the file with zeroed data. This 1692 * can be done via fallocate(fd, 0) */ 1693 len = bdrv_getlength(aiocb->bs); 1694 if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) { 1695 int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1696 if (ret == 0 || ret != -ENOTSUP) { 1697 return ret; 1698 } 1699 s->has_fallocate = false; 1700 } 1701 #endif 1702 1703 return -ENOTSUP; 1704 } 1705 1706 static int handle_aiocb_write_zeroes_unmap(void *opaque) 1707 { 1708 RawPosixAIOData *aiocb = opaque; 1709 BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque; 1710 1711 /* First try to write zeros and unmap at the same time */ 1712 1713 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1714 int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1715 aiocb->aio_offset, aiocb->aio_nbytes); 1716 switch (ret) { 1717 case -ENOTSUP: 1718 case -EINVAL: 1719 case -EBUSY: 1720 break; 1721 default: 1722 return ret; 1723 } 1724 #endif 1725 1726 /* If we couldn't manage to unmap while guaranteed that the area reads as 1727 * all-zero afterwards, just write zeroes without unmapping */ 1728 return handle_aiocb_write_zeroes(aiocb); 1729 } 1730 1731 #ifndef HAVE_COPY_FILE_RANGE 1732 static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, 1733 off_t *out_off, size_t len, unsigned int flags) 1734 { 1735 #ifdef __NR_copy_file_range 1736 return syscall(__NR_copy_file_range, in_fd, in_off, out_fd, 1737 out_off, len, flags); 1738 #else 1739 errno = ENOSYS; 1740 return -1; 1741 #endif 1742 } 1743 #endif 1744 1745 static int handle_aiocb_copy_range(void *opaque) 1746 { 1747 RawPosixAIOData *aiocb = opaque; 1748 uint64_t bytes = aiocb->aio_nbytes; 1749 off_t in_off = aiocb->aio_offset; 1750 off_t out_off = aiocb->copy_range.aio_offset2; 1751 1752 while (bytes) { 1753 ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off, 1754 aiocb->copy_range.aio_fd2, &out_off, 1755 bytes, 0); 1756 trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off, 1757 aiocb->copy_range.aio_fd2, out_off, bytes, 1758 0, ret); 1759 if (ret == 0) { 1760 /* No progress (e.g. when beyond EOF), let the caller fall back to 1761 * buffer I/O. 
*/ 1762 return -ENOSPC; 1763 } 1764 if (ret < 0) { 1765 switch (errno) { 1766 case ENOSYS: 1767 return -ENOTSUP; 1768 case EINTR: 1769 continue; 1770 default: 1771 return -errno; 1772 } 1773 } 1774 bytes -= ret; 1775 } 1776 return 0; 1777 } 1778 1779 static int handle_aiocb_discard(void *opaque) 1780 { 1781 RawPosixAIOData *aiocb = opaque; 1782 int ret = -EOPNOTSUPP; 1783 BDRVRawState *s = aiocb->bs->opaque; 1784 1785 if (!s->has_discard) { 1786 return -ENOTSUP; 1787 } 1788 1789 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1790 #ifdef BLKDISCARD 1791 do { 1792 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1793 if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { 1794 return 0; 1795 } 1796 } while (errno == EINTR); 1797 1798 ret = -errno; 1799 #endif 1800 } else { 1801 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1802 ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1803 aiocb->aio_offset, aiocb->aio_nbytes); 1804 #endif 1805 } 1806 1807 ret = translate_err(ret); 1808 if (ret == -ENOTSUP) { 1809 s->has_discard = false; 1810 } 1811 return ret; 1812 } 1813 1814 /* 1815 * Help alignment probing by allocating the first block. 1816 * 1817 * When reading with direct I/O from unallocated area on Gluster backed by XFS, 1818 * reading succeeds regardless of request length. In this case we fallback to 1819 * safe alignment which is not optimal. Allocating the first block avoids this 1820 * fallback. 1821 * 1822 * fd may be opened with O_DIRECT, but we don't know the buffer alignment or 1823 * request alignment, so we use safe values. 1824 * 1825 * Returns: 0 on success, -errno on failure. Since this is an optimization, 1826 * caller may ignore failures. 1827 */ 1828 static int allocate_first_block(int fd, size_t max_size) 1829 { 1830 size_t write_size = (max_size < MAX_BLOCKSIZE) 1831 ? BDRV_SECTOR_SIZE 1832 : MAX_BLOCKSIZE; 1833 size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size); 1834 void *buf; 1835 ssize_t n; 1836 int ret; 1837 1838 buf = qemu_memalign(max_align, write_size); 1839 memset(buf, 0, write_size); 1840 1841 do { 1842 n = pwrite(fd, buf, write_size, 0); 1843 } while (n == -1 && errno == EINTR); 1844 1845 ret = (n == -1) ? -errno : 0; 1846 1847 qemu_vfree(buf); 1848 return ret; 1849 } 1850 1851 static int handle_aiocb_truncate(void *opaque) 1852 { 1853 RawPosixAIOData *aiocb = opaque; 1854 int result = 0; 1855 int64_t current_length = 0; 1856 char *buf = NULL; 1857 struct stat st; 1858 int fd = aiocb->aio_fildes; 1859 int64_t offset = aiocb->aio_offset; 1860 PreallocMode prealloc = aiocb->truncate.prealloc; 1861 Error **errp = aiocb->truncate.errp; 1862 1863 if (fstat(fd, &st) < 0) { 1864 result = -errno; 1865 error_setg_errno(errp, -result, "Could not stat file"); 1866 return result; 1867 } 1868 1869 current_length = st.st_size; 1870 if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { 1871 error_setg(errp, "Cannot use preallocation for shrinking files"); 1872 return -ENOTSUP; 1873 } 1874 1875 switch (prealloc) { 1876 #ifdef CONFIG_POSIX_FALLOCATE 1877 case PREALLOC_MODE_FALLOC: 1878 /* 1879 * Truncating before posix_fallocate() makes it about twice slower on 1880 * file systems that do not support fallocate(), trying to check if a 1881 * block is allocated before allocating it, so don't do that here. 1882 */ 1883 if (offset != current_length) { 1884 result = -posix_fallocate(fd, current_length, 1885 offset - current_length); 1886 if (result != 0) { 1887 /* posix_fallocate() doesn't set errno. 
*/ 1888 error_setg_errno(errp, -result, 1889 "Could not preallocate new data"); 1890 } else if (current_length == 0) { 1891 /* 1892 * posix_fallocate() uses fallocate() if the filesystem 1893 * supports it, or falls back to manually writing zeroes. If 1894 * fallocate() was used, unaligned reads from the fallocated 1895 * area in raw_probe_alignment() will succeed, hence we need to 1896 * allocate the first block. 1897 * 1898 * Optimize future alignment probing; ignore failures. 1899 */ 1900 allocate_first_block(fd, offset); 1901 } 1902 } else { 1903 result = 0; 1904 } 1905 goto out; 1906 #endif 1907 case PREALLOC_MODE_FULL: 1908 { 1909 int64_t num = 0, left = offset - current_length; 1910 off_t seek_result; 1911 1912 /* 1913 * Knowing the final size from the beginning could allow the file 1914 * system driver to do fewer allocations and possibly avoid 1915 * fragmentation of the file. 1916 */ 1917 if (ftruncate(fd, offset) != 0) { 1918 result = -errno; 1919 error_setg_errno(errp, -result, "Could not resize file"); 1920 goto out; 1921 } 1922 1923 buf = g_malloc0(65536); 1924 1925 seek_result = lseek(fd, current_length, SEEK_SET); 1926 if (seek_result < 0) { 1927 result = -errno; 1928 error_setg_errno(errp, -result, 1929 "Failed to seek to the old end of file"); 1930 goto out; 1931 } 1932 1933 while (left > 0) { 1934 num = MIN(left, 65536); 1935 result = write(fd, buf, num); 1936 if (result < 0) { 1937 if (errno == EINTR) { 1938 continue; 1939 } 1940 result = -errno; 1941 error_setg_errno(errp, -result, 1942 "Could not write zeros for preallocation"); 1943 goto out; 1944 } 1945 left -= result; 1946 } 1947 if (result >= 0) { 1948 result = fsync(fd); 1949 if (result < 0) { 1950 result = -errno; 1951 error_setg_errno(errp, -result, 1952 "Could not flush file to disk"); 1953 goto out; 1954 } 1955 } 1956 goto out; 1957 } 1958 case PREALLOC_MODE_OFF: 1959 if (ftruncate(fd, offset) != 0) { 1960 result = -errno; 1961 error_setg_errno(errp, -result, "Could not resize file"); 1962 } else if (current_length == 0 && offset > current_length) { 1963 /* Optimize future alignment probing; ignore failures. */ 1964 allocate_first_block(fd, offset); 1965 } 1966 return result; 1967 default: 1968 result = -ENOTSUP; 1969 error_setg(errp, "Unsupported preallocation mode: %s", 1970 PreallocMode_str(prealloc)); 1971 return result; 1972 } 1973 1974 out: 1975 if (result < 0) { 1976 if (ftruncate(fd, current_length) < 0) { 1977 error_report("Failed to restore old file length: %s", 1978 strerror(errno)); 1979 } 1980 } 1981 1982 g_free(buf); 1983 return result; 1984 } 1985 1986 static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs, 1987 ThreadPoolFunc func, void *arg) 1988 { 1989 /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */ 1990 ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 1991 return thread_pool_submit_co(pool, func, arg); 1992 } 1993 1994 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, 1995 uint64_t bytes, QEMUIOVector *qiov, int type) 1996 { 1997 BDRVRawState *s = bs->opaque; 1998 RawPosixAIOData acb; 1999 2000 if (fd_open(bs) < 0) 2001 return -EIO; 2002 2003 /* 2004 * When using O_DIRECT, the request must be aligned to be able to use 2005 * either libaio or io_uring interface. If not, fall back to regular thread 2006 * pool read/write code which emulates this for us if we 2007 * set QEMU_AIO_MISALIGNED. 
2008 */ 2009 if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { 2010 type |= QEMU_AIO_MISALIGNED; 2011 #ifdef CONFIG_LINUX_IO_URING 2012 } else if (s->use_linux_io_uring) { 2013 LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); 2014 assert(qiov->size == bytes); 2015 return luring_co_submit(bs, aio, s->fd, offset, qiov, type); 2016 #endif 2017 #ifdef CONFIG_LINUX_AIO 2018 } else if (s->use_linux_aio) { 2019 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 2020 assert(qiov->size == bytes); 2021 return laio_co_submit(bs, aio, s->fd, offset, qiov, type); 2022 #endif 2023 } 2024 2025 acb = (RawPosixAIOData) { 2026 .bs = bs, 2027 .aio_fildes = s->fd, 2028 .aio_type = type, 2029 .aio_offset = offset, 2030 .aio_nbytes = bytes, 2031 .io = { 2032 .iov = qiov->iov, 2033 .niov = qiov->niov, 2034 }, 2035 }; 2036 2037 assert(qiov->size == bytes); 2038 return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); 2039 } 2040 2041 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, 2042 uint64_t bytes, QEMUIOVector *qiov, 2043 int flags) 2044 { 2045 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ); 2046 } 2047 2048 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, 2049 uint64_t bytes, QEMUIOVector *qiov, 2050 int flags) 2051 { 2052 assert(flags == 0); 2053 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); 2054 } 2055 2056 static void raw_aio_plug(BlockDriverState *bs) 2057 { 2058 BDRVRawState __attribute__((unused)) *s = bs->opaque; 2059 #ifdef CONFIG_LINUX_AIO 2060 if (s->use_linux_aio) { 2061 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 2062 laio_io_plug(bs, aio); 2063 } 2064 #endif 2065 #ifdef CONFIG_LINUX_IO_URING 2066 if (s->use_linux_io_uring) { 2067 LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); 2068 luring_io_plug(bs, aio); 2069 } 2070 #endif 2071 } 2072 2073 static void raw_aio_unplug(BlockDriverState *bs) 2074 { 2075 BDRVRawState __attribute__((unused)) *s = bs->opaque; 2076 #ifdef CONFIG_LINUX_AIO 2077 if (s->use_linux_aio) { 2078 LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); 2079 laio_io_unplug(bs, aio); 2080 } 2081 #endif 2082 #ifdef CONFIG_LINUX_IO_URING 2083 if (s->use_linux_io_uring) { 2084 LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); 2085 luring_io_unplug(bs, aio); 2086 } 2087 #endif 2088 } 2089 2090 static int raw_co_flush_to_disk(BlockDriverState *bs) 2091 { 2092 BDRVRawState *s = bs->opaque; 2093 RawPosixAIOData acb; 2094 int ret; 2095 2096 ret = fd_open(bs); 2097 if (ret < 0) { 2098 return ret; 2099 } 2100 2101 acb = (RawPosixAIOData) { 2102 .bs = bs, 2103 .aio_fildes = s->fd, 2104 .aio_type = QEMU_AIO_FLUSH, 2105 }; 2106 2107 #ifdef CONFIG_LINUX_IO_URING 2108 if (s->use_linux_io_uring) { 2109 LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); 2110 return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH); 2111 } 2112 #endif 2113 return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb); 2114 } 2115 2116 static void raw_aio_attach_aio_context(BlockDriverState *bs, 2117 AioContext *new_context) 2118 { 2119 BDRVRawState __attribute__((unused)) *s = bs->opaque; 2120 #ifdef CONFIG_LINUX_AIO 2121 if (s->use_linux_aio) { 2122 Error *local_err = NULL; 2123 if (!aio_setup_linux_aio(new_context, &local_err)) { 2124 error_reportf_err(local_err, "Unable to use native AIO, " 2125 "falling back to thread pool: "); 2126 s->use_linux_aio = false; 2127 } 2128 } 2129 #endif 
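/* io_uring gets the same treatment as Linux AIO above: probe it once for the new AioContext and, if setup fails, report the error and drop back to the thread pool by clearing use_linux_io_uring. */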
2130 #ifdef CONFIG_LINUX_IO_URING 2131 if (s->use_linux_io_uring) { 2132 Error *local_err = NULL; 2133 if (!aio_setup_linux_io_uring(new_context, &local_err)) { 2134 error_reportf_err(local_err, "Unable to use linux io_uring, " 2135 "falling back to thread pool: "); 2136 s->use_linux_io_uring = false; 2137 } 2138 } 2139 #endif 2140 } 2141 2142 static void raw_close(BlockDriverState *bs) 2143 { 2144 BDRVRawState *s = bs->opaque; 2145 2146 if (s->fd >= 0) { 2147 qemu_close(s->fd); 2148 s->fd = -1; 2149 } 2150 } 2151 2152 /** 2153 * Truncates the given regular file @fd to @offset and, when growing, fills the 2154 * new space according to @prealloc. 2155 * 2156 * Returns: 0 on success, -errno on failure. 2157 */ 2158 static int coroutine_fn 2159 raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, 2160 PreallocMode prealloc, Error **errp) 2161 { 2162 RawPosixAIOData acb; 2163 2164 acb = (RawPosixAIOData) { 2165 .bs = bs, 2166 .aio_fildes = fd, 2167 .aio_type = QEMU_AIO_TRUNCATE, 2168 .aio_offset = offset, 2169 .truncate = { 2170 .prealloc = prealloc, 2171 .errp = errp, 2172 }, 2173 }; 2174 2175 return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb); 2176 } 2177 2178 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, 2179 bool exact, PreallocMode prealloc, 2180 BdrvRequestFlags flags, Error **errp) 2181 { 2182 BDRVRawState *s = bs->opaque; 2183 struct stat st; 2184 int ret; 2185 2186 if (fstat(s->fd, &st)) { 2187 ret = -errno; 2188 error_setg_errno(errp, -ret, "Failed to fstat() the file"); 2189 return ret; 2190 } 2191 2192 if (S_ISREG(st.st_mode)) { 2193 /* Always resizes to the exact @offset */ 2194 return raw_regular_truncate(bs, s->fd, offset, prealloc, errp); 2195 } 2196 2197 if (prealloc != PREALLOC_MODE_OFF) { 2198 error_setg(errp, "Preallocation mode '%s' unsupported for this " 2199 "non-regular file", PreallocMode_str(prealloc)); 2200 return -ENOTSUP; 2201 } 2202 2203 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 2204 int64_t cur_length = raw_getlength(bs); 2205 2206 if (offset != cur_length && exact) { 2207 error_setg(errp, "Cannot resize device files"); 2208 return -ENOTSUP; 2209 } else if (offset > cur_length) { 2210 error_setg(errp, "Cannot grow device files"); 2211 return -EINVAL; 2212 } 2213 } else { 2214 error_setg(errp, "Resizing this file is not supported"); 2215 return -ENOTSUP; 2216 } 2217 2218 return 0; 2219 } 2220 2221 #ifdef __OpenBSD__ 2222 static int64_t raw_getlength(BlockDriverState *bs) 2223 { 2224 BDRVRawState *s = bs->opaque; 2225 int fd = s->fd; 2226 struct stat st; 2227 2228 if (fstat(fd, &st)) 2229 return -errno; 2230 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 2231 struct disklabel dl; 2232 2233 if (ioctl(fd, DIOCGDINFO, &dl)) 2234 return -errno; 2235 return (uint64_t)dl.d_secsize * 2236 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 2237 } else 2238 return st.st_size; 2239 } 2240 #elif defined(__NetBSD__) 2241 static int64_t raw_getlength(BlockDriverState *bs) 2242 { 2243 BDRVRawState *s = bs->opaque; 2244 int fd = s->fd; 2245 struct stat st; 2246 2247 if (fstat(fd, &st)) 2248 return -errno; 2249 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 2250 struct dkwedge_info dkw; 2251 2252 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) { 2253 return dkw.dkw_size * 512; 2254 } else { 2255 struct disklabel dl; 2256 2257 if (ioctl(fd, DIOCGDINFO, &dl)) 2258 return -errno; 2259 return (uint64_t)dl.d_secsize * 2260 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 2261 } 2262 } else 2263 return st.st_size; 2264 } 2265 
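/*
 * Every raw_getlength() variant in this #if chain combines up to three
 * strategies: st_size from fstat() for regular files, a platform ioctl for
 * the media size of a character or block device, and, as a last resort,
 * seeking to the end of the file.  A minimal, self-contained sketch of that
 * last-resort probe (illustrative only; the helper name and the path passed
 * to it are hypothetical, and error handling is reduced to the essentials):
 *
 *     #include <errno.h>
 *     #include <fcntl.h>
 *     #include <stdint.h>
 *     #include <unistd.h>
 *
 *     static int64_t example_probe_size(const char *path)
 *     {
 *         int fd = open(path, O_RDONLY);
 *         off_t end;
 *         int64_t ret;
 *
 *         if (fd < 0) {
 *             return -errno;
 *         }
 *         end = lseek(fd, 0, SEEK_END);   // byte offset of EOF == file size
 *         ret = (end < 0) ? -errno : (int64_t)end;
 *         close(fd);
 *         return ret;
 *     }
 */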
#elif defined(__sun__)
2266 static int64_t raw_getlength(BlockDriverState *bs)
2267 {
2268 BDRVRawState *s = bs->opaque;
2269 struct dk_minfo minfo;
2270 int ret;
2271 int64_t size;
2272
2273 ret = fd_open(bs);
2274 if (ret < 0) {
2275 return ret;
2276 }
2277
2278 /*
2279 * Use the DKIOCGMEDIAINFO ioctl to read the size.
2280 */
2281 ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2282 if (ret != -1) {
2283 return minfo.dki_lbsize * minfo.dki_capacity;
2284 }
2285
2286 /*
2287 * There are reports that lseek on some devices fails, but
2288 * irc discussion said that contingency on contingency was overkill.
2289 */
2290 size = lseek(s->fd, 0, SEEK_END);
2291 if (size < 0) {
2292 return -errno;
2293 }
2294 return size;
2295 }
2296 #elif defined(CONFIG_BSD)
2297 static int64_t raw_getlength(BlockDriverState *bs)
2298 {
2299 BDRVRawState *s = bs->opaque;
2300 int fd = s->fd;
2301 int64_t size;
2302 struct stat sb;
2303 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2304 int reopened = 0;
2305 #endif
2306 int ret;
2307
2308 ret = fd_open(bs);
2309 if (ret < 0)
2310 return ret;
2311
2312 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2313 again:
2314 #endif
2315 if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2316 #ifdef DIOCGMEDIASIZE
2317 if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
2318 #elif defined(DIOCGPART)
2319 {
2320 struct partinfo pi;
2321 if (ioctl(fd, DIOCGPART, &pi) == 0)
2322 size = pi.media_size;
2323 else
2324 size = 0;
2325 }
2326 if (size == 0)
2327 #endif
2328 #if defined(__APPLE__) && defined(__MACH__)
2329 {
2330 uint64_t sectors = 0;
2331 uint32_t sector_size = 0;
2332
2333 if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2334 && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2335 size = sectors * sector_size;
2336 } else {
2337 size = lseek(fd, 0LL, SEEK_END);
2338 if (size < 0) {
2339 return -errno;
2340 }
2341 }
2342 }
2343 #else
2344 size = lseek(fd, 0LL, SEEK_END);
2345 if (size < 0) {
2346 return -errno;
2347 }
2348 #endif
2349 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2350 switch(s->type) {
2351 case FTYPE_CD:
2352 /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2353 if (size == 2048LL * (unsigned)-1)
2354 size = 0;
2355 /* XXX no disc? maybe we need to reopen...
*/
2356 if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2357 reopened = 1;
2358 goto again;
2359 }
2360 }
2361 #endif
2362 } else {
2363 size = lseek(fd, 0, SEEK_END);
2364 if (size < 0) {
2365 return -errno;
2366 }
2367 }
2368 return size;
2369 }
2370 #else
2371 static int64_t raw_getlength(BlockDriverState *bs)
2372 {
2373 BDRVRawState *s = bs->opaque;
2374 int ret;
2375 int64_t size;
2376
2377 ret = fd_open(bs);
2378 if (ret < 0) {
2379 return ret;
2380 }
2381
2382 size = lseek(s->fd, 0, SEEK_END);
2383 if (size < 0) {
2384 return -errno;
2385 }
2386 return size;
2387 }
2388 #endif
2389
2390 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2391 {
2392 struct stat st;
2393 BDRVRawState *s = bs->opaque;
2394
2395 if (fstat(s->fd, &st) < 0) {
2396 return -errno;
2397 }
2398 return (int64_t)st.st_blocks * 512;
2399 }
2400
2401 static int coroutine_fn
2402 raw_co_create(BlockdevCreateOptions *options, Error **errp)
2403 {
2404 BlockdevCreateOptionsFile *file_opts;
2405 Error *local_err = NULL;
2406 int fd;
2407 uint64_t perm, shared;
2408 int result = 0;
2409
2410 /* Validate options and set default values */
2411 assert(options->driver == BLOCKDEV_DRIVER_FILE);
2412 file_opts = &options->u.file;
2413
2414 if (!file_opts->has_nocow) {
2415 file_opts->nocow = false;
2416 }
2417 if (!file_opts->has_preallocation) {
2418 file_opts->preallocation = PREALLOC_MODE_OFF;
2419 }
2420 if (!file_opts->has_extent_size_hint) {
2421 file_opts->extent_size_hint = 1 * MiB;
2422 }
2423 if (file_opts->extent_size_hint > UINT32_MAX) {
2424 result = -EINVAL;
2425 error_setg(errp, "Extent size hint is too large");
2426 goto out;
2427 }
2428
2429 /* Create file */
2430 fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
2431 if (fd < 0) {
2432 result = -errno;
2433 goto out;
2434 }
2435
2436 /* Take permissions: We want to discard everything, so we need
2437 * BLK_PERM_WRITE; and truncation to the desired size requires
2438 * BLK_PERM_RESIZE.
2439 * On the other hand, we cannot share the RESIZE permission
2440 * because we promise that after this function, the file has the
2441 * size given in the options. If someone else were to resize it
2442 * concurrently, we could not guarantee that.
2443 * Note that after this function, we can no longer guarantee that
2444 * the file is not touched by a third party, so it may be resized
2445 * then. */
2446 perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2447 shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2448
2449 /* Step one: Take locks */
2450 result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2451 if (result < 0) {
2452 goto out_close;
2453 }
2454
2455 /* Step two: Check that nobody else has taken conflicting locks */
2456 result = raw_check_lock_bytes(fd, perm, shared, errp);
2457 if (result < 0) {
2458 error_append_hint(errp,
2459 "Is another process using the image [%s]?\n",
2460 file_opts->filename);
2461 goto out_unlock;
2462 }
2463
2464 /* Clear the file by truncating it to 0 */
2465 result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2466 if (result < 0) {
2467 goto out_unlock;
2468 }
2469
2470 if (file_opts->nocow) {
2471 #ifdef __linux__
2472 /* Set the NOCOW flag to solve performance issues on filesystems like btrfs.
2473 * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
2474 * will be ignored since any failure of this operation should not
2475 * block the remaining work.
2476 */
2477 int attr;
2478 if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2479 attr |= FS_NOCOW_FL;
2480 ioctl(fd, FS_IOC_SETFLAGS, &attr);
2481 }
2482 #endif
2483 }
2484 #ifdef FS_IOC_FSSETXATTR
2485 /*
2486 * Try to set the extent size hint. Failure is not fatal, and a warning is
2487 * only printed if the option was explicitly specified.
2488 */
2489 {
2490 struct fsxattr attr;
2491 result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
2492 if (result == 0) {
2493 attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
2494 attr.fsx_extsize = file_opts->extent_size_hint;
2495 result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
2496 }
2497 if (result < 0 && file_opts->has_extent_size_hint &&
2498 file_opts->extent_size_hint)
2499 {
2500 warn_report("Failed to set extent size hint: %s",
2501 strerror(errno));
2502 }
2503 }
2504 #endif
2505
2506 /* Resize and potentially preallocate the file to the desired
2507 * final size */
2508 result = raw_regular_truncate(NULL, fd, file_opts->size,
2509 file_opts->preallocation, errp);
2510 if (result < 0) {
2511 goto out_unlock;
2512 }
2513
2514 out_unlock:
2515 raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2516 if (local_err) {
2517 /* The above call should not fail, and if it does, that does
2518 * not mean the whole creation operation has failed. So
2519 * report it to the user for their convenience, but do not report
2520 * it to the caller. */
2521 warn_report_err(local_err);
2522 }
2523
2524 out_close:
2525 if (qemu_close(fd) != 0 && result == 0) {
2526 result = -errno;
2527 error_setg_errno(errp, -result, "Could not close the new file");
2528 }
2529 out:
2530 return result;
2531 }
2532
2533 static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
2534 const char *filename,
2535 QemuOpts *opts,
2536 Error **errp)
2537 {
2538 BlockdevCreateOptions options;
2539 int64_t total_size = 0;
2540 int64_t extent_size_hint = 0;
2541 bool has_extent_size_hint = false;
2542 bool nocow = false;
2543 PreallocMode prealloc;
2544 char *buf = NULL;
2545 Error *local_err = NULL;
2546
2547 /* Skip file: protocol prefix */
2548 strstart(filename, "file:", &filename);
2549
2550 /* Read out options */
2551 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2552 BDRV_SECTOR_SIZE);
2553 if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
2554 has_extent_size_hint = true;
2555 extent_size_hint =
2556 qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
2557 }
2558 nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2559 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2560 prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2561 PREALLOC_MODE_OFF, &local_err);
2562 g_free(buf);
2563 if (local_err) {
2564 error_propagate(errp, local_err);
2565 return -EINVAL;
2566 }
2567
2568 options = (BlockdevCreateOptions) {
2569 .driver = BLOCKDEV_DRIVER_FILE,
2570 .u.file = {
2571 .filename = (char *) filename,
2572 .size = total_size,
2573 .has_preallocation = true,
2574 .preallocation = prealloc,
2575 .has_nocow = true,
2576 .nocow = nocow,
2577 .has_extent_size_hint = has_extent_size_hint,
2578 .extent_size_hint = extent_size_hint,
2579 },
2580 };
2581 return raw_co_create(&options, errp);
2582 }
2583
2584 static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
2585 Error **errp)
2586 {
2587 struct stat st;
2588 int ret;
2589
2590 if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) {
2591 error_setg_errno(errp, ENOENT, "%s is not a regular file",
2592 bs->filename);
2593 return -ENOENT;
2594 }
2595
2596 ret = unlink(bs->filename);
2597 if (ret <
0) {
2598 ret = -errno;
2599 error_setg_errno(errp, -ret, "Error when deleting file %s",
2600 bs->filename);
2601 }
2602
2603 return ret;
2604 }
2605
2606 /*
2607 * Find allocation range in @bs around offset @start.
2608 * May change underlying file descriptor's file offset.
2609 * If @start is not in a hole, store @start in @data, and the
2610 * beginning of the next hole in @hole, and return 0.
2611 * If @start is in a non-trailing hole, store @start in @hole and the
2612 * beginning of the next non-hole in @data, and return 0.
2613 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2614 * If we can't find out, return a negative errno other than -ENXIO.
2615 */
2616 static int find_allocation(BlockDriverState *bs, off_t start,
2617 off_t *data, off_t *hole)
2618 {
2619 #if defined SEEK_HOLE && defined SEEK_DATA
2620 BDRVRawState *s = bs->opaque;
2621 off_t offs;
2622
2623 /*
2624 * SEEK_DATA cases:
2625 * D1. offs == start: start is in data
2626 * D2. offs > start: start is in a hole, next data at offs
2627 * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2628 * or start is beyond EOF
2629 * If the latter happens, the file has been truncated behind
2630 * our back since we opened it. All bets are off then.
2631 * Treating it like a trailing hole is simplest.
2632 * D4. offs < 0, errno != ENXIO: we learned nothing
2633 */
2634 offs = lseek(s->fd, start, SEEK_DATA);
2635 if (offs < 0) {
2636 return -errno; /* D3 or D4 */
2637 }
2638
2639 if (offs < start) {
2640 /* This is not a valid return by lseek(). We are safe to just return
2641 * -EIO in this case, and we'll treat it like D4. */
2642 return -EIO;
2643 }
2644
2645 if (offs > start) {
2646 /* D2: in hole, next data at offs */
2647 *hole = start;
2648 *data = offs;
2649 return 0;
2650 }
2651
2652 /* D1: in data, end not yet known */
2653
2654 /*
2655 * SEEK_HOLE cases:
2656 * H1. offs == start: start is in a hole
2657 * If this happens here, a hole has been dug behind our back
2658 * since the previous lseek().
2659 * H2. offs > start: either start is in data, next hole at offs,
2660 * or start is in trailing hole, EOF at offs
2661 * Linux treats trailing holes like any other hole: offs ==
2662 * start. Solaris seeks to EOF instead: offs > start (blech).
2663 * If that happens here, a hole has been dug behind our back
2664 * since the previous lseek().
2665 * H3. offs < 0, errno = ENXIO: start is beyond EOF
2666 * If this happens, the file has been truncated behind our
2667 * back since we opened it. Treat it like a trailing hole.
2668 * H4. offs < 0, errno != ENXIO: we learned nothing
2669 * Pretend we know nothing at all, i.e. "forget" about D1.
2670 */
2671 offs = lseek(s->fd, start, SEEK_HOLE);
2672 if (offs < 0) {
2673 return -errno; /* D1 and (H3 or H4) */
2674 }
2675
2676 if (offs < start) {
2677 /* This is not a valid return by lseek(). We are safe to just return
2678 * -EIO in this case, and we'll treat it like H4. */
2679 return -EIO;
2680 }
2681
2682 if (offs > start) {
2683 /*
2684 * D1 and H2: either in data, next hole at offs, or it was in
2685 * data but is now in a trailing hole. In the latter case,
2686 * all bets are off. Treating it as if there was data all
2687 * the way to EOF is safe, so simply do that.
2688 */
2689 *data = start;
2690 *hole = offs;
2691 return 0;
2692 }
2693
2694 /* D1 and H1 */
2695 return -EBUSY;
2696 #else
2697 return -ENOTSUP;
2698 #endif
2699 }
2700
2701 /*
2702 * Returns the allocation status of the specified offset.
2703 * 2704 * The block layer guarantees 'offset' and 'bytes' are within bounds. 2705 * 2706 * 'pnum' is set to the number of bytes (including and immediately following 2707 * the specified offset) that are known to be in the same 2708 * allocated/unallocated state. 2709 * 2710 * 'bytes' is the max value 'pnum' should be set to. 2711 */ 2712 static int coroutine_fn raw_co_block_status(BlockDriverState *bs, 2713 bool want_zero, 2714 int64_t offset, 2715 int64_t bytes, int64_t *pnum, 2716 int64_t *map, 2717 BlockDriverState **file) 2718 { 2719 off_t data = 0, hole = 0; 2720 int ret; 2721 2722 assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment)); 2723 2724 ret = fd_open(bs); 2725 if (ret < 0) { 2726 return ret; 2727 } 2728 2729 if (!want_zero) { 2730 *pnum = bytes; 2731 *map = offset; 2732 *file = bs; 2733 return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; 2734 } 2735 2736 ret = find_allocation(bs, offset, &data, &hole); 2737 if (ret == -ENXIO) { 2738 /* Trailing hole */ 2739 *pnum = bytes; 2740 ret = BDRV_BLOCK_ZERO; 2741 } else if (ret < 0) { 2742 /* No info available, so pretend there are no holes */ 2743 *pnum = bytes; 2744 ret = BDRV_BLOCK_DATA; 2745 } else if (data == offset) { 2746 /* On a data extent, compute bytes to the end of the extent, 2747 * possibly including a partial sector at EOF. */ 2748 *pnum = MIN(bytes, hole - offset); 2749 2750 /* 2751 * We are not allowed to return partial sectors, though, so 2752 * round up if necessary. 2753 */ 2754 if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) { 2755 int64_t file_length = raw_getlength(bs); 2756 if (file_length > 0) { 2757 /* Ignore errors, this is just a safeguard */ 2758 assert(hole == file_length); 2759 } 2760 *pnum = ROUND_UP(*pnum, bs->bl.request_alignment); 2761 } 2762 2763 ret = BDRV_BLOCK_DATA; 2764 } else { 2765 /* On a hole, compute bytes to the beginning of the next extent. 
*/ 2766 assert(hole == offset); 2767 *pnum = MIN(bytes, data - offset); 2768 ret = BDRV_BLOCK_ZERO; 2769 } 2770 *map = offset; 2771 *file = bs; 2772 return ret | BDRV_BLOCK_OFFSET_VALID; 2773 } 2774 2775 #if defined(__linux__) 2776 /* Verify that the file is not in the page cache */ 2777 static void check_cache_dropped(BlockDriverState *bs, Error **errp) 2778 { 2779 const size_t window_size = 128 * 1024 * 1024; 2780 BDRVRawState *s = bs->opaque; 2781 void *window = NULL; 2782 size_t length = 0; 2783 unsigned char *vec; 2784 size_t page_size; 2785 off_t offset; 2786 off_t end; 2787 2788 /* mincore(2) page status information requires 1 byte per page */ 2789 page_size = sysconf(_SC_PAGESIZE); 2790 vec = g_malloc(DIV_ROUND_UP(window_size, page_size)); 2791 2792 end = raw_getlength(bs); 2793 2794 for (offset = 0; offset < end; offset += window_size) { 2795 void *new_window; 2796 size_t new_length; 2797 size_t vec_end; 2798 size_t i; 2799 int ret; 2800 2801 /* Unmap previous window if size has changed */ 2802 new_length = MIN(end - offset, window_size); 2803 if (new_length != length) { 2804 munmap(window, length); 2805 window = NULL; 2806 length = 0; 2807 } 2808 2809 new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE, 2810 s->fd, offset); 2811 if (new_window == MAP_FAILED) { 2812 error_setg_errno(errp, errno, "mmap failed"); 2813 break; 2814 } 2815 2816 window = new_window; 2817 length = new_length; 2818 2819 ret = mincore(window, length, vec); 2820 if (ret < 0) { 2821 error_setg_errno(errp, errno, "mincore failed"); 2822 break; 2823 } 2824 2825 vec_end = DIV_ROUND_UP(length, page_size); 2826 for (i = 0; i < vec_end; i++) { 2827 if (vec[i] & 0x1) { 2828 break; 2829 } 2830 } 2831 if (i < vec_end) { 2832 error_setg(errp, "page cache still in use!"); 2833 break; 2834 } 2835 } 2836 2837 if (window) { 2838 munmap(window, length); 2839 } 2840 2841 g_free(vec); 2842 } 2843 #endif /* __linux__ */ 2844 2845 static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs, 2846 Error **errp) 2847 { 2848 BDRVRawState *s = bs->opaque; 2849 int ret; 2850 2851 ret = fd_open(bs); 2852 if (ret < 0) { 2853 error_setg_errno(errp, -ret, "The file descriptor is not open"); 2854 return; 2855 } 2856 2857 if (!s->drop_cache) { 2858 return; 2859 } 2860 2861 if (s->open_flags & O_DIRECT) { 2862 return; /* No host kernel page cache */ 2863 } 2864 2865 #if defined(__linux__) 2866 /* This sets the scene for the next syscall... */ 2867 ret = bdrv_co_flush(bs); 2868 if (ret < 0) { 2869 error_setg_errno(errp, -ret, "flush failed"); 2870 return; 2871 } 2872 2873 /* Linux does not invalidate pages that are dirty, locked, or mmapped by a 2874 * process. These limitations are okay because we just fsynced the file, 2875 * we don't use mmap, and the file should not be in use by other processes. 2876 */ 2877 ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED); 2878 if (ret != 0) { /* the return value is a positive errno */ 2879 error_setg_errno(errp, ret, "fadvise failed"); 2880 return; 2881 } 2882 2883 if (s->check_cache_dropped) { 2884 check_cache_dropped(bs, errp); 2885 } 2886 #else /* __linux__ */ 2887 /* Do nothing. Live migration to a remote host with cache.direct=off is 2888 * unsupported on other host operating systems. Cache consistency issues 2889 * may occur but no error is reported here, partly because that's the 2890 * historical behavior and partly because it's hard to differentiate valid 2891 * configurations that should not cause errors. 
2892 */ 2893 #endif /* !__linux__ */ 2894 } 2895 2896 static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) 2897 { 2898 if (ret) { 2899 s->stats.discard_nb_failed++; 2900 } else { 2901 s->stats.discard_nb_ok++; 2902 s->stats.discard_bytes_ok += nbytes; 2903 } 2904 } 2905 2906 static coroutine_fn int 2907 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev) 2908 { 2909 BDRVRawState *s = bs->opaque; 2910 RawPosixAIOData acb; 2911 int ret; 2912 2913 acb = (RawPosixAIOData) { 2914 .bs = bs, 2915 .aio_fildes = s->fd, 2916 .aio_type = QEMU_AIO_DISCARD, 2917 .aio_offset = offset, 2918 .aio_nbytes = bytes, 2919 }; 2920 2921 if (blkdev) { 2922 acb.aio_type |= QEMU_AIO_BLKDEV; 2923 } 2924 2925 ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb); 2926 raw_account_discard(s, bytes, ret); 2927 return ret; 2928 } 2929 2930 static coroutine_fn int 2931 raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) 2932 { 2933 return raw_do_pdiscard(bs, offset, bytes, false); 2934 } 2935 2936 static int coroutine_fn 2937 raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, 2938 BdrvRequestFlags flags, bool blkdev) 2939 { 2940 BDRVRawState *s = bs->opaque; 2941 RawPosixAIOData acb; 2942 ThreadPoolFunc *handler; 2943 2944 #ifdef CONFIG_FALLOCATE 2945 if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) { 2946 BdrvTrackedRequest *req; 2947 2948 /* 2949 * This is a workaround for a bug in the Linux XFS driver, 2950 * where writes submitted through the AIO interface will be 2951 * discarded if they happen beyond a concurrently running 2952 * fallocate() that increases the file length (i.e., both the 2953 * write and the fallocate() happen beyond the EOF). 2954 * 2955 * To work around it, we extend the tracked request for this 2956 * zero write until INT64_MAX (effectively infinity), and mark 2957 * it as serializing. 2958 * 2959 * We have to enable this workaround for all filesystems and 2960 * AIO modes (not just XFS with aio=native), because for 2961 * remote filesystems we do not know the host configuration. 
2962 */ 2963 2964 req = bdrv_co_get_self_request(bs); 2965 assert(req); 2966 assert(req->type == BDRV_TRACKED_WRITE); 2967 assert(req->offset <= offset); 2968 assert(req->offset + req->bytes >= offset + bytes); 2969 2970 req->bytes = BDRV_MAX_LENGTH - req->offset; 2971 2972 assert(bdrv_check_request(req->offset, req->bytes) == 0); 2973 2974 bdrv_make_request_serialising(req, bs->bl.request_alignment); 2975 } 2976 #endif 2977 2978 acb = (RawPosixAIOData) { 2979 .bs = bs, 2980 .aio_fildes = s->fd, 2981 .aio_type = QEMU_AIO_WRITE_ZEROES, 2982 .aio_offset = offset, 2983 .aio_nbytes = bytes, 2984 }; 2985 2986 if (blkdev) { 2987 acb.aio_type |= QEMU_AIO_BLKDEV; 2988 } 2989 if (flags & BDRV_REQ_NO_FALLBACK) { 2990 acb.aio_type |= QEMU_AIO_NO_FALLBACK; 2991 } 2992 2993 if (flags & BDRV_REQ_MAY_UNMAP) { 2994 acb.aio_type |= QEMU_AIO_DISCARD; 2995 handler = handle_aiocb_write_zeroes_unmap; 2996 } else { 2997 handler = handle_aiocb_write_zeroes; 2998 } 2999 3000 return raw_thread_pool_submit(bs, handler, &acb); 3001 } 3002 3003 static int coroutine_fn raw_co_pwrite_zeroes( 3004 BlockDriverState *bs, int64_t offset, 3005 int bytes, BdrvRequestFlags flags) 3006 { 3007 return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false); 3008 } 3009 3010 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 3011 { 3012 return 0; 3013 } 3014 3015 static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs) 3016 { 3017 BDRVRawState *s = bs->opaque; 3018 return (BlockStatsSpecificFile) { 3019 .discard_nb_ok = s->stats.discard_nb_ok, 3020 .discard_nb_failed = s->stats.discard_nb_failed, 3021 .discard_bytes_ok = s->stats.discard_bytes_ok, 3022 }; 3023 } 3024 3025 static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs) 3026 { 3027 BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); 3028 3029 stats->driver = BLOCKDEV_DRIVER_FILE; 3030 stats->u.file = get_blockstats_specific_file(bs); 3031 3032 return stats; 3033 } 3034 3035 static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs) 3036 { 3037 BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); 3038 3039 stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE; 3040 stats->u.host_device = get_blockstats_specific_file(bs); 3041 3042 return stats; 3043 } 3044 3045 static QemuOptsList raw_create_opts = { 3046 .name = "raw-create-opts", 3047 .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), 3048 .desc = { 3049 { 3050 .name = BLOCK_OPT_SIZE, 3051 .type = QEMU_OPT_SIZE, 3052 .help = "Virtual disk size" 3053 }, 3054 { 3055 .name = BLOCK_OPT_NOCOW, 3056 .type = QEMU_OPT_BOOL, 3057 .help = "Turn off copy-on-write (valid only on btrfs)" 3058 }, 3059 { 3060 .name = BLOCK_OPT_PREALLOC, 3061 .type = QEMU_OPT_STRING, 3062 .help = "Preallocation mode (allowed values: off" 3063 #ifdef CONFIG_POSIX_FALLOCATE 3064 ", falloc" 3065 #endif 3066 ", full)" 3067 }, 3068 { 3069 .name = BLOCK_OPT_EXTENT_SIZE_HINT, 3070 .type = QEMU_OPT_SIZE, 3071 .help = "Extent size hint for the image file, 0 to disable" 3072 }, 3073 { /* end of list */ } 3074 } 3075 }; 3076 3077 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared, 3078 Error **errp) 3079 { 3080 BDRVRawState *s = bs->opaque; 3081 BDRVRawReopenState *rs = NULL; 3082 int open_flags; 3083 int ret; 3084 3085 if (s->perm_change_fd) { 3086 /* 3087 * In the context of reopen, this function may be called several times 3088 * (directly and recursively while change permissions of the parent). 
3089 * This is even true for children that don't inherit from the original 3090 * reopen node, so s->reopen_state is not set. 3091 * 3092 * Ignore all but the first call. 3093 */ 3094 return 0; 3095 } 3096 3097 if (s->reopen_state) { 3098 /* We already have a new file descriptor to set permissions for */ 3099 assert(s->reopen_state->perm == perm); 3100 assert(s->reopen_state->shared_perm == shared); 3101 rs = s->reopen_state->opaque; 3102 s->perm_change_fd = rs->fd; 3103 s->perm_change_flags = rs->open_flags; 3104 } else { 3105 /* We may need a new fd if auto-read-only switches the mode */ 3106 ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm, 3107 false, errp); 3108 if (ret < 0) { 3109 return ret; 3110 } else if (ret != s->fd) { 3111 s->perm_change_fd = ret; 3112 s->perm_change_flags = open_flags; 3113 } 3114 } 3115 3116 /* Prepare permissions on old fd to avoid conflicts between old and new, 3117 * but keep everything locked that new will need. */ 3118 ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp); 3119 if (ret < 0) { 3120 goto fail; 3121 } 3122 3123 /* Copy locks to the new fd */ 3124 if (s->perm_change_fd && s->use_lock) { 3125 ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared, 3126 false, errp); 3127 if (ret < 0) { 3128 raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); 3129 goto fail; 3130 } 3131 } 3132 return 0; 3133 3134 fail: 3135 if (s->perm_change_fd && !s->reopen_state) { 3136 qemu_close(s->perm_change_fd); 3137 } 3138 s->perm_change_fd = 0; 3139 return ret; 3140 } 3141 3142 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared) 3143 { 3144 BDRVRawState *s = bs->opaque; 3145 3146 /* For reopen, we have already switched to the new fd (.bdrv_set_perm is 3147 * called after .bdrv_reopen_commit) */ 3148 if (s->perm_change_fd && s->fd != s->perm_change_fd) { 3149 qemu_close(s->fd); 3150 s->fd = s->perm_change_fd; 3151 s->open_flags = s->perm_change_flags; 3152 } 3153 s->perm_change_fd = 0; 3154 3155 raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL); 3156 s->perm = perm; 3157 s->shared_perm = shared; 3158 } 3159 3160 static void raw_abort_perm_update(BlockDriverState *bs) 3161 { 3162 BDRVRawState *s = bs->opaque; 3163 3164 /* For reopen, .bdrv_reopen_abort is called afterwards and will close 3165 * the file descriptor. 
*/ 3166 if (s->perm_change_fd && !s->reopen_state) { 3167 qemu_close(s->perm_change_fd); 3168 } 3169 s->perm_change_fd = 0; 3170 3171 raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); 3172 } 3173 3174 static int coroutine_fn raw_co_copy_range_from( 3175 BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, 3176 BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, 3177 BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) 3178 { 3179 return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3180 read_flags, write_flags); 3181 } 3182 3183 static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, 3184 BdrvChild *src, 3185 uint64_t src_offset, 3186 BdrvChild *dst, 3187 uint64_t dst_offset, 3188 uint64_t bytes, 3189 BdrvRequestFlags read_flags, 3190 BdrvRequestFlags write_flags) 3191 { 3192 RawPosixAIOData acb; 3193 BDRVRawState *s = bs->opaque; 3194 BDRVRawState *src_s; 3195 3196 assert(dst->bs == bs); 3197 if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) { 3198 return -ENOTSUP; 3199 } 3200 3201 src_s = src->bs->opaque; 3202 if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) { 3203 return -EIO; 3204 } 3205 3206 acb = (RawPosixAIOData) { 3207 .bs = bs, 3208 .aio_type = QEMU_AIO_COPY_RANGE, 3209 .aio_fildes = src_s->fd, 3210 .aio_offset = src_offset, 3211 .aio_nbytes = bytes, 3212 .copy_range = { 3213 .aio_fd2 = s->fd, 3214 .aio_offset2 = dst_offset, 3215 }, 3216 }; 3217 3218 return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb); 3219 } 3220 3221 BlockDriver bdrv_file = { 3222 .format_name = "file", 3223 .protocol_name = "file", 3224 .instance_size = sizeof(BDRVRawState), 3225 .bdrv_needs_filename = true, 3226 .bdrv_probe = NULL, /* no probe for protocols */ 3227 .bdrv_parse_filename = raw_parse_filename, 3228 .bdrv_file_open = raw_open, 3229 .bdrv_reopen_prepare = raw_reopen_prepare, 3230 .bdrv_reopen_commit = raw_reopen_commit, 3231 .bdrv_reopen_abort = raw_reopen_abort, 3232 .bdrv_close = raw_close, 3233 .bdrv_co_create = raw_co_create, 3234 .bdrv_co_create_opts = raw_co_create_opts, 3235 .bdrv_has_zero_init = bdrv_has_zero_init_1, 3236 .bdrv_co_block_status = raw_co_block_status, 3237 .bdrv_co_invalidate_cache = raw_co_invalidate_cache, 3238 .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, 3239 .bdrv_co_delete_file = raw_co_delete_file, 3240 3241 .bdrv_co_preadv = raw_co_preadv, 3242 .bdrv_co_pwritev = raw_co_pwritev, 3243 .bdrv_co_flush_to_disk = raw_co_flush_to_disk, 3244 .bdrv_co_pdiscard = raw_co_pdiscard, 3245 .bdrv_co_copy_range_from = raw_co_copy_range_from, 3246 .bdrv_co_copy_range_to = raw_co_copy_range_to, 3247 .bdrv_refresh_limits = raw_refresh_limits, 3248 .bdrv_io_plug = raw_aio_plug, 3249 .bdrv_io_unplug = raw_aio_unplug, 3250 .bdrv_attach_aio_context = raw_aio_attach_aio_context, 3251 3252 .bdrv_co_truncate = raw_co_truncate, 3253 .bdrv_getlength = raw_getlength, 3254 .bdrv_get_info = raw_get_info, 3255 .bdrv_get_allocated_file_size 3256 = raw_get_allocated_file_size, 3257 .bdrv_get_specific_stats = raw_get_specific_stats, 3258 .bdrv_check_perm = raw_check_perm, 3259 .bdrv_set_perm = raw_set_perm, 3260 .bdrv_abort_perm_update = raw_abort_perm_update, 3261 .create_opts = &raw_create_opts, 3262 .mutable_opts = mutable_opts, 3263 }; 3264 3265 /***********************************************/ 3266 /* host device */ 3267 3268 #if defined(__APPLE__) && defined(__MACH__) 3269 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 3270 CFIndex maxPathSize, int flags); 3271 static char 
*FindEjectableOpticalMedia(io_iterator_t *mediaIterator) 3272 { 3273 kern_return_t kernResult = KERN_FAILURE; 3274 mach_port_t masterPort; 3275 CFMutableDictionaryRef classesToMatch; 3276 const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass}; 3277 char *mediaType = NULL; 3278 3279 kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); 3280 if ( KERN_SUCCESS != kernResult ) { 3281 printf( "IOMasterPort returned %d\n", kernResult ); 3282 } 3283 3284 int index; 3285 for (index = 0; index < ARRAY_SIZE(matching_array); index++) { 3286 classesToMatch = IOServiceMatching(matching_array[index]); 3287 if (classesToMatch == NULL) { 3288 error_report("IOServiceMatching returned NULL for %s", 3289 matching_array[index]); 3290 continue; 3291 } 3292 CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey), 3293 kCFBooleanTrue); 3294 kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch, 3295 mediaIterator); 3296 if (kernResult != KERN_SUCCESS) { 3297 error_report("Note: IOServiceGetMatchingServices returned %d", 3298 kernResult); 3299 continue; 3300 } 3301 3302 /* If a match was found, leave the loop */ 3303 if (*mediaIterator != 0) { 3304 trace_file_FindEjectableOpticalMedia(matching_array[index]); 3305 mediaType = g_strdup(matching_array[index]); 3306 break; 3307 } 3308 } 3309 return mediaType; 3310 } 3311 3312 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 3313 CFIndex maxPathSize, int flags) 3314 { 3315 io_object_t nextMedia; 3316 kern_return_t kernResult = KERN_FAILURE; 3317 *bsdPath = '\0'; 3318 nextMedia = IOIteratorNext( mediaIterator ); 3319 if ( nextMedia ) 3320 { 3321 CFTypeRef bsdPathAsCFString; 3322 bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); 3323 if ( bsdPathAsCFString ) { 3324 size_t devPathLength; 3325 strcpy( bsdPath, _PATH_DEV ); 3326 if (flags & BDRV_O_NOCACHE) { 3327 strcat(bsdPath, "r"); 3328 } 3329 devPathLength = strlen( bsdPath ); 3330 if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { 3331 kernResult = KERN_SUCCESS; 3332 } 3333 CFRelease( bsdPathAsCFString ); 3334 } 3335 IOObjectRelease( nextMedia ); 3336 } 3337 3338 return kernResult; 3339 } 3340 3341 /* Sets up a real cdrom for use in QEMU */ 3342 static bool setup_cdrom(char *bsd_path, Error **errp) 3343 { 3344 int index, num_of_test_partitions = 2, fd; 3345 char test_partition[MAXPATHLEN]; 3346 bool partition_found = false; 3347 3348 /* look for a working partition */ 3349 for (index = 0; index < num_of_test_partitions; index++) { 3350 snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path, 3351 index); 3352 fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL); 3353 if (fd >= 0) { 3354 partition_found = true; 3355 qemu_close(fd); 3356 break; 3357 } 3358 } 3359 3360 /* if a working partition on the device was not found */ 3361 if (partition_found == false) { 3362 error_setg(errp, "Failed to find a working partition on disc"); 3363 } else { 3364 trace_file_setup_cdrom(test_partition); 3365 pstrcpy(bsd_path, MAXPATHLEN, test_partition); 3366 } 3367 return partition_found; 3368 } 3369 3370 /* Prints directions on mounting and unmounting a device */ 3371 static void print_unmounting_directions(const char *file_name) 3372 { 3373 error_report("If device %s is mounted on the desktop, unmount" 3374 " it first before using it in QEMU", file_name); 3375 error_report("Command to unmount device: diskutil 
unmountDisk %s", 3376 file_name); 3377 error_report("Command to mount device: diskutil mountDisk %s", file_name); 3378 } 3379 3380 #endif /* defined(__APPLE__) && defined(__MACH__) */ 3381 3382 static int hdev_probe_device(const char *filename) 3383 { 3384 struct stat st; 3385 3386 /* allow a dedicated CD-ROM driver to match with a higher priority */ 3387 if (strstart(filename, "/dev/cdrom", NULL)) 3388 return 50; 3389 3390 if (stat(filename, &st) >= 0 && 3391 (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { 3392 return 100; 3393 } 3394 3395 return 0; 3396 } 3397 3398 static void hdev_parse_filename(const char *filename, QDict *options, 3399 Error **errp) 3400 { 3401 bdrv_parse_filename_strip_prefix(filename, "host_device:", options); 3402 } 3403 3404 static bool hdev_is_sg(BlockDriverState *bs) 3405 { 3406 3407 #if defined(__linux__) 3408 3409 BDRVRawState *s = bs->opaque; 3410 struct stat st; 3411 struct sg_scsi_id scsiid; 3412 int sg_version; 3413 int ret; 3414 3415 if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) { 3416 return false; 3417 } 3418 3419 ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version); 3420 if (ret < 0) { 3421 return false; 3422 } 3423 3424 ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid); 3425 if (ret >= 0) { 3426 trace_file_hdev_is_sg(scsiid.scsi_type, sg_version); 3427 return true; 3428 } 3429 3430 #endif 3431 3432 return false; 3433 } 3434 3435 static int hdev_open(BlockDriverState *bs, QDict *options, int flags, 3436 Error **errp) 3437 { 3438 BDRVRawState *s = bs->opaque; 3439 int ret; 3440 3441 #if defined(__APPLE__) && defined(__MACH__) 3442 /* 3443 * Caution: while qdict_get_str() is fine, getting non-string types 3444 * would require more care. When @options come from -blockdev or 3445 * blockdev_add, its members are typed according to the QAPI 3446 * schema, but when they come from -drive, they're all QString. 
3447 */ 3448 const char *filename = qdict_get_str(options, "filename"); 3449 char bsd_path[MAXPATHLEN] = ""; 3450 bool error_occurred = false; 3451 3452 /* If using a real cdrom */ 3453 if (strcmp(filename, "/dev/cdrom") == 0) { 3454 char *mediaType = NULL; 3455 kern_return_t ret_val; 3456 io_iterator_t mediaIterator = 0; 3457 3458 mediaType = FindEjectableOpticalMedia(&mediaIterator); 3459 if (mediaType == NULL) { 3460 error_setg(errp, "Please make sure your CD/DVD is in the optical" 3461 " drive"); 3462 error_occurred = true; 3463 goto hdev_open_Mac_error; 3464 } 3465 3466 ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags); 3467 if (ret_val != KERN_SUCCESS) { 3468 error_setg(errp, "Could not get BSD path for optical drive"); 3469 error_occurred = true; 3470 goto hdev_open_Mac_error; 3471 } 3472 3473 /* If a real optical drive was not found */ 3474 if (bsd_path[0] == '\0') { 3475 error_setg(errp, "Failed to obtain bsd path for optical drive"); 3476 error_occurred = true; 3477 goto hdev_open_Mac_error; 3478 } 3479 3480 /* If using a cdrom disc and finding a partition on the disc failed */ 3481 if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 && 3482 setup_cdrom(bsd_path, errp) == false) { 3483 print_unmounting_directions(bsd_path); 3484 error_occurred = true; 3485 goto hdev_open_Mac_error; 3486 } 3487 3488 qdict_put_str(options, "filename", bsd_path); 3489 3490 hdev_open_Mac_error: 3491 g_free(mediaType); 3492 if (mediaIterator) { 3493 IOObjectRelease(mediaIterator); 3494 } 3495 if (error_occurred) { 3496 return -ENOENT; 3497 } 3498 } 3499 #endif /* defined(__APPLE__) && defined(__MACH__) */ 3500 3501 s->type = FTYPE_FILE; 3502 3503 ret = raw_open_common(bs, options, flags, 0, true, errp); 3504 if (ret < 0) { 3505 #if defined(__APPLE__) && defined(__MACH__) 3506 if (*bsd_path) { 3507 filename = bsd_path; 3508 } 3509 /* if a physical device experienced an error while being opened */ 3510 if (strncmp(filename, "/dev/", 5) == 0) { 3511 print_unmounting_directions(filename); 3512 } 3513 #endif /* defined(__APPLE__) && defined(__MACH__) */ 3514 return ret; 3515 } 3516 3517 /* Since this does ioctl the device must be already opened */ 3518 bs->sg = hdev_is_sg(bs); 3519 3520 return ret; 3521 } 3522 3523 #if defined(__linux__) 3524 static int coroutine_fn 3525 hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 3526 { 3527 BDRVRawState *s = bs->opaque; 3528 RawPosixAIOData acb; 3529 int ret; 3530 3531 ret = fd_open(bs); 3532 if (ret < 0) { 3533 return ret; 3534 } 3535 3536 if (req == SG_IO && s->pr_mgr) { 3537 struct sg_io_hdr *io_hdr = buf; 3538 if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || 3539 io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { 3540 return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), 3541 s->fd, io_hdr); 3542 } 3543 } 3544 3545 acb = (RawPosixAIOData) { 3546 .bs = bs, 3547 .aio_type = QEMU_AIO_IOCTL, 3548 .aio_fildes = s->fd, 3549 .aio_offset = 0, 3550 .ioctl = { 3551 .buf = buf, 3552 .cmd = req, 3553 }, 3554 }; 3555 3556 return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb); 3557 } 3558 #endif /* linux */ 3559 3560 static int fd_open(BlockDriverState *bs) 3561 { 3562 BDRVRawState *s = bs->opaque; 3563 3564 /* this is just to ensure s->fd is sane (its called by io ops) */ 3565 if (s->fd >= 0) 3566 return 0; 3567 return -EIO; 3568 } 3569 3570 static coroutine_fn int 3571 hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) 3572 { 3573 BDRVRawState *s = bs->opaque; 3574 int ret; 3575 3576 ret = fd_open(bs); 3577 if 
(ret < 0) { 3578 raw_account_discard(s, bytes, ret); 3579 return ret; 3580 } 3581 return raw_do_pdiscard(bs, offset, bytes, true); 3582 } 3583 3584 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, 3585 int64_t offset, int bytes, BdrvRequestFlags flags) 3586 { 3587 int rc; 3588 3589 rc = fd_open(bs); 3590 if (rc < 0) { 3591 return rc; 3592 } 3593 3594 return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true); 3595 } 3596 3597 static BlockDriver bdrv_host_device = { 3598 .format_name = "host_device", 3599 .protocol_name = "host_device", 3600 .instance_size = sizeof(BDRVRawState), 3601 .bdrv_needs_filename = true, 3602 .bdrv_probe_device = hdev_probe_device, 3603 .bdrv_parse_filename = hdev_parse_filename, 3604 .bdrv_file_open = hdev_open, 3605 .bdrv_close = raw_close, 3606 .bdrv_reopen_prepare = raw_reopen_prepare, 3607 .bdrv_reopen_commit = raw_reopen_commit, 3608 .bdrv_reopen_abort = raw_reopen_abort, 3609 .bdrv_co_create_opts = bdrv_co_create_opts_simple, 3610 .create_opts = &bdrv_create_opts_simple, 3611 .mutable_opts = mutable_opts, 3612 .bdrv_co_invalidate_cache = raw_co_invalidate_cache, 3613 .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, 3614 3615 .bdrv_co_preadv = raw_co_preadv, 3616 .bdrv_co_pwritev = raw_co_pwritev, 3617 .bdrv_co_flush_to_disk = raw_co_flush_to_disk, 3618 .bdrv_co_pdiscard = hdev_co_pdiscard, 3619 .bdrv_co_copy_range_from = raw_co_copy_range_from, 3620 .bdrv_co_copy_range_to = raw_co_copy_range_to, 3621 .bdrv_refresh_limits = raw_refresh_limits, 3622 .bdrv_io_plug = raw_aio_plug, 3623 .bdrv_io_unplug = raw_aio_unplug, 3624 .bdrv_attach_aio_context = raw_aio_attach_aio_context, 3625 3626 .bdrv_co_truncate = raw_co_truncate, 3627 .bdrv_getlength = raw_getlength, 3628 .bdrv_get_info = raw_get_info, 3629 .bdrv_get_allocated_file_size 3630 = raw_get_allocated_file_size, 3631 .bdrv_get_specific_stats = hdev_get_specific_stats, 3632 .bdrv_check_perm = raw_check_perm, 3633 .bdrv_set_perm = raw_set_perm, 3634 .bdrv_abort_perm_update = raw_abort_perm_update, 3635 .bdrv_probe_blocksizes = hdev_probe_blocksizes, 3636 .bdrv_probe_geometry = hdev_probe_geometry, 3637 3638 /* generic scsi device */ 3639 #ifdef __linux__ 3640 .bdrv_co_ioctl = hdev_co_ioctl, 3641 #endif 3642 }; 3643 3644 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 3645 static void cdrom_parse_filename(const char *filename, QDict *options, 3646 Error **errp) 3647 { 3648 bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options); 3649 } 3650 #endif 3651 3652 #ifdef __linux__ 3653 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 3654 Error **errp) 3655 { 3656 BDRVRawState *s = bs->opaque; 3657 3658 s->type = FTYPE_CD; 3659 3660 /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ 3661 return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp); 3662 } 3663 3664 static int cdrom_probe_device(const char *filename) 3665 { 3666 int fd, ret; 3667 int prio = 0; 3668 struct stat st; 3669 3670 fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL); 3671 if (fd < 0) { 3672 goto out; 3673 } 3674 ret = fstat(fd, &st); 3675 if (ret == -1 || !S_ISBLK(st.st_mode)) { 3676 goto outc; 3677 } 3678 3679 /* Attempt to detect via a CDROM specific ioctl */ 3680 ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 3681 if (ret >= 0) 3682 prio = 100; 3683 3684 outc: 3685 qemu_close(fd); 3686 out: 3687 return prio; 3688 } 3689 3690 static bool cdrom_is_inserted(BlockDriverState *bs) 3691 { 3692 BDRVRawState *s = bs->opaque; 3693 int 
ret; 3694 3695 ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 3696 return ret == CDS_DISC_OK; 3697 } 3698 3699 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 3700 { 3701 BDRVRawState *s = bs->opaque; 3702 3703 if (eject_flag) { 3704 if (ioctl(s->fd, CDROMEJECT, NULL) < 0) 3705 perror("CDROMEJECT"); 3706 } else { 3707 if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) 3708 perror("CDROMEJECT"); 3709 } 3710 } 3711 3712 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 3713 { 3714 BDRVRawState *s = bs->opaque; 3715 3716 if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { 3717 /* 3718 * Note: an error can happen if the distribution automatically 3719 * mounts the CD-ROM 3720 */ 3721 /* perror("CDROM_LOCKDOOR"); */ 3722 } 3723 } 3724 3725 static BlockDriver bdrv_host_cdrom = { 3726 .format_name = "host_cdrom", 3727 .protocol_name = "host_cdrom", 3728 .instance_size = sizeof(BDRVRawState), 3729 .bdrv_needs_filename = true, 3730 .bdrv_probe_device = cdrom_probe_device, 3731 .bdrv_parse_filename = cdrom_parse_filename, 3732 .bdrv_file_open = cdrom_open, 3733 .bdrv_close = raw_close, 3734 .bdrv_reopen_prepare = raw_reopen_prepare, 3735 .bdrv_reopen_commit = raw_reopen_commit, 3736 .bdrv_reopen_abort = raw_reopen_abort, 3737 .bdrv_co_create_opts = bdrv_co_create_opts_simple, 3738 .create_opts = &bdrv_create_opts_simple, 3739 .mutable_opts = mutable_opts, 3740 .bdrv_co_invalidate_cache = raw_co_invalidate_cache, 3741 3742 .bdrv_co_preadv = raw_co_preadv, 3743 .bdrv_co_pwritev = raw_co_pwritev, 3744 .bdrv_co_flush_to_disk = raw_co_flush_to_disk, 3745 .bdrv_refresh_limits = raw_refresh_limits, 3746 .bdrv_io_plug = raw_aio_plug, 3747 .bdrv_io_unplug = raw_aio_unplug, 3748 .bdrv_attach_aio_context = raw_aio_attach_aio_context, 3749 3750 .bdrv_co_truncate = raw_co_truncate, 3751 .bdrv_getlength = raw_getlength, 3752 .has_variable_length = true, 3753 .bdrv_get_allocated_file_size 3754 = raw_get_allocated_file_size, 3755 3756 /* removable device support */ 3757 .bdrv_is_inserted = cdrom_is_inserted, 3758 .bdrv_eject = cdrom_eject, 3759 .bdrv_lock_medium = cdrom_lock_medium, 3760 3761 /* generic scsi device */ 3762 .bdrv_co_ioctl = hdev_co_ioctl, 3763 }; 3764 #endif /* __linux__ */ 3765 3766 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 3767 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 3768 Error **errp) 3769 { 3770 BDRVRawState *s = bs->opaque; 3771 int ret; 3772 3773 s->type = FTYPE_CD; 3774 3775 ret = raw_open_common(bs, options, flags, 0, true, errp); 3776 if (ret) { 3777 return ret; 3778 } 3779 3780 /* make sure the door isn't locked at this time */ 3781 ioctl(s->fd, CDIOCALLOW); 3782 return 0; 3783 } 3784 3785 static int cdrom_probe_device(const char *filename) 3786 { 3787 if (strstart(filename, "/dev/cd", NULL) || 3788 strstart(filename, "/dev/acd", NULL)) 3789 return 100; 3790 return 0; 3791 } 3792 3793 static int cdrom_reopen(BlockDriverState *bs) 3794 { 3795 BDRVRawState *s = bs->opaque; 3796 int fd; 3797 3798 /* 3799 * Force reread of possibly changed/newly loaded disc, 3800 * FreeBSD seems to not notice sometimes... 
3801 */ 3802 if (s->fd >= 0) 3803 qemu_close(s->fd); 3804 fd = qemu_open(bs->filename, s->open_flags, NULL); 3805 if (fd < 0) { 3806 s->fd = -1; 3807 return -EIO; 3808 } 3809 s->fd = fd; 3810 3811 /* make sure the door isn't locked at this time */ 3812 ioctl(s->fd, CDIOCALLOW); 3813 return 0; 3814 } 3815 3816 static bool cdrom_is_inserted(BlockDriverState *bs) 3817 { 3818 return raw_getlength(bs) > 0; 3819 } 3820 3821 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 3822 { 3823 BDRVRawState *s = bs->opaque; 3824 3825 if (s->fd < 0) 3826 return; 3827 3828 (void) ioctl(s->fd, CDIOCALLOW); 3829 3830 if (eject_flag) { 3831 if (ioctl(s->fd, CDIOCEJECT) < 0) 3832 perror("CDIOCEJECT"); 3833 } else { 3834 if (ioctl(s->fd, CDIOCCLOSE) < 0) 3835 perror("CDIOCCLOSE"); 3836 } 3837 3838 cdrom_reopen(bs); 3839 } 3840 3841 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 3842 { 3843 BDRVRawState *s = bs->opaque; 3844 3845 if (s->fd < 0) 3846 return; 3847 if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) { 3848 /* 3849 * Note: an error can happen if the distribution automatically 3850 * mounts the CD-ROM 3851 */ 3852 /* perror("CDROM_LOCKDOOR"); */ 3853 } 3854 } 3855 3856 static BlockDriver bdrv_host_cdrom = { 3857 .format_name = "host_cdrom", 3858 .protocol_name = "host_cdrom", 3859 .instance_size = sizeof(BDRVRawState), 3860 .bdrv_needs_filename = true, 3861 .bdrv_probe_device = cdrom_probe_device, 3862 .bdrv_parse_filename = cdrom_parse_filename, 3863 .bdrv_file_open = cdrom_open, 3864 .bdrv_close = raw_close, 3865 .bdrv_reopen_prepare = raw_reopen_prepare, 3866 .bdrv_reopen_commit = raw_reopen_commit, 3867 .bdrv_reopen_abort = raw_reopen_abort, 3868 .bdrv_co_create_opts = bdrv_co_create_opts_simple, 3869 .create_opts = &bdrv_create_opts_simple, 3870 .mutable_opts = mutable_opts, 3871 3872 .bdrv_co_preadv = raw_co_preadv, 3873 .bdrv_co_pwritev = raw_co_pwritev, 3874 .bdrv_co_flush_to_disk = raw_co_flush_to_disk, 3875 .bdrv_refresh_limits = raw_refresh_limits, 3876 .bdrv_io_plug = raw_aio_plug, 3877 .bdrv_io_unplug = raw_aio_unplug, 3878 .bdrv_attach_aio_context = raw_aio_attach_aio_context, 3879 3880 .bdrv_co_truncate = raw_co_truncate, 3881 .bdrv_getlength = raw_getlength, 3882 .has_variable_length = true, 3883 .bdrv_get_allocated_file_size 3884 = raw_get_allocated_file_size, 3885 3886 /* removable device support */ 3887 .bdrv_is_inserted = cdrom_is_inserted, 3888 .bdrv_eject = cdrom_eject, 3889 .bdrv_lock_medium = cdrom_lock_medium, 3890 }; 3891 #endif /* __FreeBSD__ */ 3892 3893 static void bdrv_file_init(void) 3894 { 3895 /* 3896 * Register all the drivers. Note that order is important, the driver 3897 * registered last will get probed first. 3898 */ 3899 bdrv_register(&bdrv_file); 3900 bdrv_register(&bdrv_host_device); 3901 #ifdef __linux__ 3902 bdrv_register(&bdrv_host_cdrom); 3903 #endif 3904 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 3905 bdrv_register(&bdrv_host_cdrom); 3906 #endif 3907 } 3908 3909 block_init(bdrv_file_init); 3910
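/*
 * The .bdrv_probe_device callbacks above decide which of these drivers
 * handles a bare host path: each registered driver returns a score (for
 * instance, 100 for a real block or character device or for a CD-ROM that
 * answers CDROM_DRIVE_STATUS, 50 for a mere "/dev/cdrom" name match, 0
 * otherwise) and the highest score wins, with later-registered drivers
 * probed first.  A minimal sketch of that selection, assuming a
 * hypothetical drivers[] array; the real loop lives in the generic block
 * layer, not in this file:
 *
 *     static BlockDriver *example_pick_driver(BlockDriver **drivers, int n,
 *                                             const char *filename)
 *     {
 *         BlockDriver *best = NULL;
 *         int best_score = 0;
 *
 *         for (int i = 0; i < n; i++) {
 *             int score = drivers[i]->bdrv_probe_device
 *                         ? drivers[i]->bdrv_probe_device(filename) : 0;
 *             if (score > best_score) {
 *                 best_score = score;
 *                 best = drivers[i];
 *             }
 *         }
 *         return best;
 *     }
 */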