/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @nb_sectors: the number of sectors of the I/O
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     int nb_sectors,
                                     bool is_write)
{
    /* must this I/O wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state,
                     is_write,
                     nb_sectors * BDRV_SECTOR_SIZE);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
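/* Illustrative examples for path_has_protocol() above (editor's sketch, not
 * part of the original source):
 *
 *     path_has_protocol("nbd:localhost:10809")  -> 1  (':' before any '/')
 *     path_has_protocol("/var/img/test.qcow2")  -> 0  ('/' comes first)
 *     path_has_protocol("c:\\images\\test.raw") -> 0 on Windows (drive prefix)
 */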
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
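/* Illustrative registration sketch (hypothetical "mydrv" driver, not part of
 * the original source).  Format drivers register themselves from a module
 * init hook, and bdrv_register() above fills in any missing coroutine/AIO
 * emulation callbacks:
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name   = "mydrv",
 *         .instance_size = sizeof(BDRVMydrvState),
 *         .bdrv_probe    = mydrv_probe,
 *         .bdrv_open     = mydrv_open,
 *         .bdrv_close    = mydrv_close,
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *     block_init(bdrv_mydrv_init);
 */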
/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
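/* Illustrative usage (editor's sketch): resolve a format name while honouring
 * the configure-time whitelist.  A NULL result means the format is either
 * unknown or not whitelisted for the requested read-only/read-write use:
 *
 *     BlockDriver *drv = bdrv_find_whitelisted_format("qcow2", false);
 *     if (!drv) {
 *         ... reject the request ...
 *     }
 */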
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (error_is_set(&cco.err)) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
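/* Illustrative usage (editor's sketch): the caller owns the resulting file
 * and is expected to unlink() it when done, as the snapshot=on code later in
 * this file does for its temporary overlay:
 *
 *     char tmp_filename[PATH_MAX + 1];
 *     int ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 *     if (ret < 0) {
 *         ... negative errno-style failure ...
 *     }
 */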
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}
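/* Illustrative examples for bdrv_find_protocol() above (editor's sketch):
 *
 *     bdrv_find_protocol("/dev/cdrom", true)
 *         -> a host device driver (probing wins over prefix parsing)
 *     bdrv_find_protocol("nbd:localhost:10809", true)
 *         -> the driver whose protocol_name is "nbd"
 *     bdrv_find_protocol("image.qcow2", true)
 *         -> the "file" driver (no "<protocol>:" prefix)
 */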
/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
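/* Illustrative mapping of -drive cache= modes (editor's sketch), as parsed by
 * bdrv_parse_cache_flags() above:
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB  (O_DIRECT + writeback)
 *     bdrv_parse_cache_flags("unsafe", &flags);
 *     // flags == BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH (flushes are ignored)
 *     bdrv_parse_cache_flags("bogus", &flags);      // returns -1
 */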
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* If bdrv_open() was called with a protocol as drv directly, that layer
     * is already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp, "Driver '%s' is not whitelisted", drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags, Error **errp)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
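/* Illustrative usage (editor's sketch; error handling abbreviated):
 *
 *     BlockDriverState *bs;
 *     Error *local_err = NULL;
 *     int ret = bdrv_file_open(&bs, "/var/img/disk.raw", NULL,
 *                              BDRV_O_RDWR, &local_err);
 *     if (ret < 0) {
 *         ... report error_get_pretty(local_err), then error_free() it ...
 *     }
 */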
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    bs->backing_hd = bdrv_new("");

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_propagate(errp, local_err);
        return ret;
    }
    /* Only copy the file name once the open succeeded; on failure
     * bs->backing_hd->file is NULL and must not be dereferenced. */
    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
            bs->backing_hd->file->filename);
    return 0;
}
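/* Note (editor's addition): on success the backing image hangs off
 * bs->backing_hd, opened read-only because back_flags strips BDRV_O_RDWR
 * above.  Guest writes therefore never reach the backing image, which is
 * what makes bdrv_commit() later in this file a distinct, explicit step.
 */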
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;
    const char *drvname;
    Error *local_err = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        char backing_filename[PATH_MAX];

        if (qdict_size(options) != 0) {
            error_setg(errp, "Can't use snapshot=on with driver-specific options");
            ret = -EINVAL;
            goto fail;
        }
        assert(filename != NULL);

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, NULL, 0, drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        /* Real path is meaningless for protocols */
        if (path_has_protocol(filename)) {
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        } else if (!realpath(filename, backing_filename)) {
            error_setg_errno(errp, errno, "Could not resolve path '%s'", filename);
            ret = -errno;
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
                             backing_filename);
        if (drv) {
            set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
                                 drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    qdict_extract_subqdict(options, &file_options, "file.");

    ret = bdrv_file_open(&file, filename, file_options,
                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR));
        qdict_del(options, "driver");
    }

    if (!drv) {
        ret = find_image_format(file, filename, &drv, &local_err);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }
    if (bs->file != file) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
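/* Illustrative usage (editor's sketch): queue several devices, then apply the
 * flag change atomically.  bdrv_reopen_multiple() below consumes and frees
 * the queue:
 *
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 */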
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}
static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (bdrv_start_throttled_reqs(bs)) {
                busy = true;
            }
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, NUL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->buffer_alignment = bs_src->buffer_alignment;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap = bs_src->dirty_bitmap;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap! */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
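/* Note (editor's addition): this swap is the primitive behind live external
 * snapshots - create an anonymous overlay BDS, open it with the old image as
 * its backing file, then bdrv_append() below swaps it on top so the guest
 * device transparently starts writing to the overlay.
 */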
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}
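/* Illustrative sketch (hypothetical device model, not part of the original
 * source): a device publishes its tray/medium callbacks through BlockDevOps
 * so the block layer can deliver the events handled below:
 *
 *     static const BlockDevOps mydev_block_ops = {
 *         .change_media_cb  = mydev_change_media_cb,
 *         .eject_request_cb = mydev_eject_request_cb,
 *         .is_tray_open     = mydev_is_tray_open,
 *     };
 *
 *     bdrv_set_dev_ops(bs, &mydev_block_ops, mydev_state);
 */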
"read" : "write"); 1778 monitor_protocol_event(ev, data); 1779 1780 qobject_decref(data); 1781 } 1782 1783 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected) 1784 { 1785 QObject *data; 1786 1787 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }", 1788 bdrv_get_device_name(bs), ejected); 1789 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data); 1790 1791 qobject_decref(data); 1792 } 1793 1794 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load) 1795 { 1796 if (bs->dev_ops && bs->dev_ops->change_media_cb) { 1797 bool tray_was_closed = !bdrv_dev_is_tray_open(bs); 1798 bs->dev_ops->change_media_cb(bs->dev_opaque, load); 1799 if (tray_was_closed) { 1800 /* tray open */ 1801 bdrv_emit_qmp_eject_event(bs, true); 1802 } 1803 if (load) { 1804 /* tray close */ 1805 bdrv_emit_qmp_eject_event(bs, false); 1806 } 1807 } 1808 } 1809 1810 bool bdrv_dev_has_removable_media(BlockDriverState *bs) 1811 { 1812 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb); 1813 } 1814 1815 void bdrv_dev_eject_request(BlockDriverState *bs, bool force) 1816 { 1817 if (bs->dev_ops && bs->dev_ops->eject_request_cb) { 1818 bs->dev_ops->eject_request_cb(bs->dev_opaque, force); 1819 } 1820 } 1821 1822 bool bdrv_dev_is_tray_open(BlockDriverState *bs) 1823 { 1824 if (bs->dev_ops && bs->dev_ops->is_tray_open) { 1825 return bs->dev_ops->is_tray_open(bs->dev_opaque); 1826 } 1827 return false; 1828 } 1829 1830 static void bdrv_dev_resize_cb(BlockDriverState *bs) 1831 { 1832 if (bs->dev_ops && bs->dev_ops->resize_cb) { 1833 bs->dev_ops->resize_cb(bs->dev_opaque); 1834 } 1835 } 1836 1837 bool bdrv_dev_is_medium_locked(BlockDriverState *bs) 1838 { 1839 if (bs->dev_ops && bs->dev_ops->is_medium_locked) { 1840 return bs->dev_ops->is_medium_locked(bs->dev_opaque); 1841 } 1842 return false; 1843 } 1844 1845 /* 1846 * Run consistency checks on an image 1847 * 1848 * Returns 0 if the check could be completed (it doesn't mean that the image is 1849 * free of errors) or -errno when an internal error occurred. The results of the 1850 * check are stored in res. 1851 */ 1852 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) 1853 { 1854 if (bs->drv->bdrv_check == NULL) { 1855 return -ENOTSUP; 1856 } 1857 1858 memset(res, 0, sizeof(*res)); 1859 return bs->drv->bdrv_check(bs, res, fix); 1860 } 1861 1862 #define COMMIT_BUF_SECTORS 2048 1863 1864 /* commit COW file into the raw image */ 1865 int bdrv_commit(BlockDriverState *bs) 1866 { 1867 BlockDriver *drv = bs->drv; 1868 int64_t sector, total_sectors; 1869 int n, ro, open_flags; 1870 int ret = 0; 1871 uint8_t *buf; 1872 char filename[PATH_MAX]; 1873 1874 if (!drv) 1875 return -ENOMEDIUM; 1876 1877 if (!bs->backing_hd) { 1878 return -ENOTSUP; 1879 } 1880 1881 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) { 1882 return -EBUSY; 1883 } 1884 1885 ro = bs->backing_hd->read_only; 1886 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. 
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}
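/* Worked example for bdrv_round_to_clusters() above (editor's sketch): with a
 * 64 KiB cluster size, c = 65536 / 512 = 128 sectors.  For sector_num = 130,
 * nb_sectors = 5:
 *
 *     *cluster_sector_num = QEMU_ALIGN_DOWN(130, 128)         = 128
 *     *cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 5, 128) = 128
 *
 * i.e. the request [130, 135) is widened to the full cluster [128, 256).
 */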
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}

typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;
2136 * 2137 * E.g., this will convert the following chain: 2138 * bottom <- base <- intermediate <- top <- active 2139 * 2140 * to 2141 * 2142 * bottom <- base <- active 2143 * 2144 * It is allowed for bottom==base, in which case it converts: 2145 * 2146 * base <- intermediate <- top <- active 2147 * 2148 * to 2149 * 2150 * base <- active 2151 * 2152 * Error conditions: 2153 * if active == top, that is considered an error 2154 * 2155 */ 2156 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, 2157 BlockDriverState *base) 2158 { 2159 BlockDriverState *intermediate; 2160 BlockDriverState *base_bs = NULL; 2161 BlockDriverState *new_top_bs = NULL; 2162 BlkIntermediateStates *intermediate_state, *next; 2163 int ret = -EIO; 2164 2165 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete; 2166 QSIMPLEQ_INIT(&states_to_delete); 2167 2168 if (!top->drv || !base->drv) { 2169 goto exit; 2170 } 2171 2172 new_top_bs = bdrv_find_overlay(active, top); 2173 2174 if (new_top_bs == NULL) { 2175 /* we could not find the image above 'top', this is an error */ 2176 goto exit; 2177 } 2178 2179 /* special case of new_top_bs->backing_hd already pointing to base - nothing 2180 * to do, no intermediate images */ 2181 if (new_top_bs->backing_hd == base) { 2182 ret = 0; 2183 goto exit; 2184 } 2185 2186 intermediate = top; 2187 2188 /* now we will go down through the list, and add each BDS we find 2189 * into our deletion queue, until we hit the 'base' 2190 */ 2191 while (intermediate) { 2192 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates)); 2193 intermediate_state->bs = intermediate; 2194 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry); 2195 2196 if (intermediate->backing_hd == base) { 2197 base_bs = intermediate->backing_hd; 2198 break; 2199 } 2200 intermediate = intermediate->backing_hd; 2201 } 2202 if (base_bs == NULL) { 2203 /* something went wrong, we did not end at the base. safely 2204 * unravel everything, and exit with error */ 2205 goto exit; 2206 } 2207 2208 /* success - we can delete the intermediate states, and link top->base */ 2209 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename, 2210 base_bs->drv ? 
base_bs->drv->format_name : ""); 2211 if (ret) { 2212 goto exit; 2213 } 2214 new_top_bs->backing_hd = base_bs; 2215 2216 2217 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2218 /* so that bdrv_close() does not recursively close the chain */ 2219 intermediate_state->bs->backing_hd = NULL; 2220 bdrv_unref(intermediate_state->bs); 2221 } 2222 ret = 0; 2223 2224 exit: 2225 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2226 g_free(intermediate_state); 2227 } 2228 return ret; 2229 } 2230 2231 2232 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 2233 size_t size) 2234 { 2235 int64_t len; 2236 2237 if (!bdrv_is_inserted(bs)) 2238 return -ENOMEDIUM; 2239 2240 if (bs->growable) 2241 return 0; 2242 2243 len = bdrv_getlength(bs); 2244 2245 if (offset < 0) 2246 return -EIO; 2247 2248 if ((offset > len) || (len - offset < size)) 2249 return -EIO; 2250 2251 return 0; 2252 } 2253 2254 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 2255 int nb_sectors) 2256 { 2257 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 2258 nb_sectors * BDRV_SECTOR_SIZE); 2259 } 2260 2261 typedef struct RwCo { 2262 BlockDriverState *bs; 2263 int64_t sector_num; 2264 int nb_sectors; 2265 QEMUIOVector *qiov; 2266 bool is_write; 2267 int ret; 2268 BdrvRequestFlags flags; 2269 } RwCo; 2270 2271 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 2272 { 2273 RwCo *rwco = opaque; 2274 2275 if (!rwco->is_write) { 2276 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, 2277 rwco->nb_sectors, rwco->qiov, 2278 rwco->flags); 2279 } else { 2280 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, 2281 rwco->nb_sectors, rwco->qiov, 2282 rwco->flags); 2283 } 2284 } 2285 2286 /* 2287 * Process a vectored synchronous request using coroutines 2288 */ 2289 static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, 2290 QEMUIOVector *qiov, bool is_write, 2291 BdrvRequestFlags flags) 2292 { 2293 Coroutine *co; 2294 RwCo rwco = { 2295 .bs = bs, 2296 .sector_num = sector_num, 2297 .nb_sectors = qiov->size >> BDRV_SECTOR_BITS, 2298 .qiov = qiov, 2299 .is_write = is_write, 2300 .ret = NOT_DONE, 2301 .flags = flags, 2302 }; 2303 assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0); 2304 2305 /** 2306 * In sync call context, when the vcpu is blocked, this throttling timer 2307 * will not fire; so the I/O throttling function has to be disabled here 2308 * if it has been enabled. 2309 */ 2310 if (bs->io_limits_enabled) { 2311 fprintf(stderr, "Disabling I/O throttling on '%s' due " 2312 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 2313 bdrv_io_limits_disable(bs); 2314 } 2315 2316 if (qemu_in_coroutine()) { 2317 /* Fast-path if already in coroutine context */ 2318 bdrv_rw_co_entry(&rwco); 2319 } else { 2320 co = qemu_coroutine_create(bdrv_rw_co_entry); 2321 qemu_coroutine_enter(co, &rwco); 2322 while (rwco.ret == NOT_DONE) { 2323 qemu_aio_wait(); 2324 } 2325 } 2326 return rwco.ret; 2327 } 2328 2329 /* 2330 * Process a synchronous request using coroutines 2331 */ 2332 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 2333 int nb_sectors, bool is_write, BdrvRequestFlags flags) 2334 { 2335 QEMUIOVector qiov; 2336 struct iovec iov = { 2337 .iov_base = (void *)buf, 2338 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 2339 }; 2340 2341 qemu_iovec_init_external(&qiov, &iov, 1); 2342 return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags); 2343 } 2344 2345 /* return < 0 if error. 
See bdrv_write() for the return codes */ 2346 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 2347 uint8_t *buf, int nb_sectors) 2348 { 2349 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 2350 } 2351 2352 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 2353 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 2354 uint8_t *buf, int nb_sectors) 2355 { 2356 bool enabled; 2357 int ret; 2358 2359 enabled = bs->io_limits_enabled; 2360 bs->io_limits_enabled = false; 2361 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 2362 bs->io_limits_enabled = enabled; 2363 return ret; 2364 } 2365 2366 /* Return < 0 if error. Important errors are: 2367 -EIO generic I/O error (may happen for all errors) 2368 -ENOMEDIUM No media inserted. 2369 -EINVAL Invalid sector number or nb_sectors 2370 -EACCES Trying to write a read-only device 2371 */ 2372 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 2373 const uint8_t *buf, int nb_sectors) 2374 { 2375 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 2376 } 2377 2378 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) 2379 { 2380 return bdrv_rwv_co(bs, sector_num, qiov, true, 0); 2381 } 2382 2383 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors) 2384 { 2385 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 2386 BDRV_REQ_ZERO_WRITE); 2387 } 2388 2389 int bdrv_pread(BlockDriverState *bs, int64_t offset, 2390 void *buf, int count1) 2391 { 2392 uint8_t tmp_buf[BDRV_SECTOR_SIZE]; 2393 int len, nb_sectors, count; 2394 int64_t sector_num; 2395 int ret; 2396 2397 count = count1; 2398 /* first read to align to sector start */ 2399 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); 2400 if (len > count) 2401 len = count; 2402 sector_num = offset >> BDRV_SECTOR_BITS; 2403 if (len > 0) { 2404 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) 2405 return ret; 2406 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len); 2407 count -= len; 2408 if (count == 0) 2409 return count1; 2410 sector_num++; 2411 buf += len; 2412 } 2413 2414 /* read the sectors "in place" */ 2415 nb_sectors = count >> BDRV_SECTOR_BITS; 2416 if (nb_sectors > 0) { 2417 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0) 2418 return ret; 2419 sector_num += nb_sectors; 2420 len = nb_sectors << BDRV_SECTOR_BITS; 2421 buf += len; 2422 count -= len; 2423 } 2424 2425 /* add data from the last sector */ 2426 if (count > 0) { 2427 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) 2428 return ret; 2429 memcpy(buf, tmp_buf, count); 2430 } 2431 return count1; 2432 } 2433 2434 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 2435 { 2436 uint8_t tmp_buf[BDRV_SECTOR_SIZE]; 2437 int len, nb_sectors, count; 2438 int64_t sector_num; 2439 int ret; 2440 2441 count = qiov->size; 2442 2443 /* first write to align to sector start */ 2444 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); 2445 if (len > count) 2446 len = count; 2447 sector_num = offset >> BDRV_SECTOR_BITS; 2448 if (len > 0) { 2449 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) 2450 return ret; 2451 qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), 2452 len); 2453 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) 2454 return ret; 2455 count -= len; 2456 if (count == 0) 2457 return qiov->size; 2458 sector_num++; 2459 } 2460 2461 /* write the sectors "in place" */ 2462 nb_sectors = count >> BDRV_SECTOR_BITS; 
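/* The sector-aligned middle of the request is written directly from the
 * caller's qiov: the temporary sub-iovec below skips the head bytes handled
 * above instead of copying through tmp_buf. */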
2463 if (nb_sectors > 0) { 2464 QEMUIOVector qiov_inplace; 2465 2466 qemu_iovec_init(&qiov_inplace, qiov->niov); 2467 qemu_iovec_concat(&qiov_inplace, qiov, len, 2468 nb_sectors << BDRV_SECTOR_BITS); 2469 ret = bdrv_writev(bs, sector_num, &qiov_inplace); 2470 qemu_iovec_destroy(&qiov_inplace); 2471 if (ret < 0) { 2472 return ret; 2473 } 2474 2475 sector_num += nb_sectors; 2476 len = nb_sectors << BDRV_SECTOR_BITS; 2477 count -= len; 2478 } 2479 2480 /* add data from the last sector */ 2481 if (count > 0) { 2482 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) 2483 return ret; 2484 qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count); 2485 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) 2486 return ret; 2487 } 2488 return qiov->size; 2489 } 2490 2491 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 2492 const void *buf, int count1) 2493 { 2494 QEMUIOVector qiov; 2495 struct iovec iov = { 2496 .iov_base = (void *) buf, 2497 .iov_len = count1, 2498 }; 2499 2500 qemu_iovec_init_external(&qiov, &iov, 1); 2501 return bdrv_pwritev(bs, offset, &qiov); 2502 } 2503 2504 /* 2505 * Writes to the file and ensures that no writes are reordered across this 2506 * request (acts as a barrier) 2507 * 2508 * Returns 0 on success, -errno in error cases. 2509 */ 2510 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 2511 const void *buf, int count) 2512 { 2513 int ret; 2514 2515 ret = bdrv_pwrite(bs, offset, buf, count); 2516 if (ret < 0) { 2517 return ret; 2518 } 2519 2520 /* No flush needed for cache modes that already do it */ 2521 if (bs->enable_write_cache) { 2522 bdrv_flush(bs); 2523 } 2524 2525 return 0; 2526 } 2527 2528 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 2529 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 2530 { 2531 /* Perform I/O through a temporary buffer so that users who scribble over 2532 * their read buffer while the operation is in progress do not end up 2533 * modifying the image file. This is critical for zero-copy guest I/O 2534 * where anything might happen inside guest memory. 2535 */ 2536 void *bounce_buffer; 2537 2538 BlockDriver *drv = bs->drv; 2539 struct iovec iov; 2540 QEMUIOVector bounce_qiov; 2541 int64_t cluster_sector_num; 2542 int cluster_nb_sectors; 2543 size_t skip_bytes; 2544 int ret; 2545 2546 /* Cover entire cluster so no additional backing file I/O is required when 2547 * allocating cluster in the image file. 2548 */ 2549 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 2550 &cluster_sector_num, &cluster_nb_sectors); 2551 2552 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 2553 cluster_sector_num, cluster_nb_sectors); 2554 2555 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 2556 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); 2557 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 2558 2559 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 2560 &bounce_qiov); 2561 if (ret < 0) { 2562 goto err; 2563 } 2564 2565 if (drv->bdrv_co_write_zeroes && 2566 buffer_is_zero(bounce_buffer, iov.iov_len)) { 2567 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 2568 cluster_nb_sectors); 2569 } else { 2570 /* This does not change the data on the disk, it is not necessary 2571 * to flush even in cache=writethrough mode. 2572 */ 2573 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 2574 &bounce_qiov); 2575 } 2576 2577 if (ret < 0) { 2578 /* It might be okay to ignore write errors for guest requests. 
If this 2579 * is a deliberate copy-on-read then we don't want to ignore the error. 2580 * Simply report it in all cases. 2581 */ 2582 goto err; 2583 } 2584 2585 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 2586 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 2587 nb_sectors * BDRV_SECTOR_SIZE); 2588 2589 err: 2590 qemu_vfree(bounce_buffer); 2591 return ret; 2592 } 2593 2594 /* 2595 * Handle a read request in coroutine context 2596 */ 2597 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 2598 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 2599 BdrvRequestFlags flags) 2600 { 2601 BlockDriver *drv = bs->drv; 2602 BdrvTrackedRequest req; 2603 int ret; 2604 2605 if (!drv) { 2606 return -ENOMEDIUM; 2607 } 2608 if (bdrv_check_request(bs, sector_num, nb_sectors)) { 2609 return -EIO; 2610 } 2611 2612 if (bs->copy_on_read) { 2613 flags |= BDRV_REQ_COPY_ON_READ; 2614 } 2615 if (flags & BDRV_REQ_COPY_ON_READ) { 2616 bs->copy_on_read_in_flight++; 2617 } 2618 2619 if (bs->copy_on_read_in_flight) { 2620 wait_for_overlapping_requests(bs, sector_num, nb_sectors); 2621 } 2622 2623 /* throttling disk I/O */ 2624 if (bs->io_limits_enabled) { 2625 bdrv_io_limits_intercept(bs, nb_sectors, false); 2626 } 2627 2628 tracked_request_begin(&req, bs, sector_num, nb_sectors, false); 2629 2630 if (flags & BDRV_REQ_COPY_ON_READ) { 2631 int pnum; 2632 2633 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 2634 if (ret < 0) { 2635 goto out; 2636 } 2637 2638 if (!ret || pnum != nb_sectors) { 2639 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 2640 goto out; 2641 } 2642 } 2643 2644 if (!(bs->zero_beyond_eof && bs->growable)) { 2645 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 2646 } else { 2647 /* Read zeros after EOF of growable BDSes */ 2648 int64_t len, total_sectors, max_nb_sectors; 2649 2650 len = bdrv_getlength(bs); 2651 if (len < 0) { 2652 ret = len; 2653 goto out; 2654 } 2655 2656 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); 2657 max_nb_sectors = MAX(0, total_sectors - sector_num); 2658 if (max_nb_sectors > 0) { 2659 ret = drv->bdrv_co_readv(bs, sector_num, 2660 MIN(nb_sectors, max_nb_sectors), qiov); 2661 } else { 2662 ret = 0; 2663 } 2664 2665 /* Reading beyond end of file is supposed to produce zeroes */ 2666 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 2667 uint64_t offset = MAX(0, total_sectors - sector_num); 2668 uint64_t bytes = (sector_num + nb_sectors - offset) * 2669 BDRV_SECTOR_SIZE; 2670 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 2671 } 2672 } 2673 2674 out: 2675 tracked_request_end(&req); 2676 2677 if (flags & BDRV_REQ_COPY_ON_READ) { 2678 bs->copy_on_read_in_flight--; 2679 } 2680 2681 return ret; 2682 } 2683 2684 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 2685 int nb_sectors, QEMUIOVector *qiov) 2686 { 2687 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 2688 2689 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 2690 } 2691 2692 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 2693 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 2694 { 2695 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 2696 2697 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 2698 BDRV_REQ_COPY_ON_READ); 2699 } 2700 2701 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 2702 int64_t sector_num, int nb_sectors) 2703 { 2704 BlockDriver *drv = bs->drv; 2705 QEMUIOVector qiov; 2706 struct iovec iov; 
2707 int ret; 2708 2709 /* TODO Emulate only part of misaligned requests instead of letting block 2710 * drivers return -ENOTSUP and emulate everything */ 2711 2712 /* First try the efficient write zeroes operation */ 2713 if (drv->bdrv_co_write_zeroes) { 2714 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); 2715 if (ret != -ENOTSUP) { 2716 return ret; 2717 } 2718 } 2719 2720 /* Fall back to bounce buffer if write zeroes is unsupported */ 2721 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; 2722 iov.iov_base = qemu_blockalign(bs, iov.iov_len); 2723 memset(iov.iov_base, 0, iov.iov_len); 2724 qemu_iovec_init_external(&qiov, &iov, 1); 2725 2726 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); 2727 2728 qemu_vfree(iov.iov_base); 2729 return ret; 2730 } 2731 2732 /* 2733 * Handle a write request in coroutine context 2734 */ 2735 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 2736 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 2737 BdrvRequestFlags flags) 2738 { 2739 BlockDriver *drv = bs->drv; 2740 BdrvTrackedRequest req; 2741 int ret; 2742 2743 if (!bs->drv) { 2744 return -ENOMEDIUM; 2745 } 2746 if (bs->read_only) { 2747 return -EACCES; 2748 } 2749 if (bdrv_check_request(bs, sector_num, nb_sectors)) { 2750 return -EIO; 2751 } 2752 2753 if (bs->copy_on_read_in_flight) { 2754 wait_for_overlapping_requests(bs, sector_num, nb_sectors); 2755 } 2756 2757 /* throttling disk I/O */ 2758 if (bs->io_limits_enabled) { 2759 bdrv_io_limits_intercept(bs, nb_sectors, true); 2760 } 2761 2762 tracked_request_begin(&req, bs, sector_num, nb_sectors, true); 2763 2764 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); 2765 2766 if (ret < 0) { 2767 /* Do nothing, write notifier decided to fail this request */ 2768 } else if (flags & BDRV_REQ_ZERO_WRITE) { 2769 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); 2770 } else { 2771 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 2772 } 2773 2774 if (ret == 0 && !bs->enable_write_cache) { 2775 ret = bdrv_co_flush(bs); 2776 } 2777 2778 if (bs->dirty_bitmap) { 2779 bdrv_set_dirty(bs, sector_num, nb_sectors); 2780 } 2781 2782 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { 2783 bs->wr_highest_sector = sector_num + nb_sectors - 1; 2784 } 2785 if (bs->growable && ret >= 0) { 2786 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 2787 } 2788 2789 tracked_request_end(&req); 2790 2791 return ret; 2792 } 2793 2794 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 2795 int nb_sectors, QEMUIOVector *qiov) 2796 { 2797 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 2798 2799 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 2800 } 2801 2802 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 2803 int64_t sector_num, int nb_sectors) 2804 { 2805 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); 2806 2807 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 2808 BDRV_REQ_ZERO_WRITE); 2809 } 2810 2811 /** 2812 * Truncate file to 'offset' bytes (needed only for file protocols) 2813 */ 2814 int bdrv_truncate(BlockDriverState *bs, int64_t offset) 2815 { 2816 BlockDriver *drv = bs->drv; 2817 int ret; 2818 if (!drv) 2819 return -ENOMEDIUM; 2820 if (!drv->bdrv_truncate) 2821 return -ENOTSUP; 2822 if (bs->read_only) 2823 return -EACCES; 2824 if (bdrv_in_use(bs)) 2825 return -EBUSY; 2826 ret = drv->bdrv_truncate(bs, offset); 2827 if (ret == 0) { 2828 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 
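/* Notify attached device models about the size change */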
2829 bdrv_dev_resize_cb(bs); 2830 } 2831 return ret; 2832 } 2833 2834 /** 2835 * Length of an allocated file in bytes. Sparse files are counted by actual 2836 * allocated space. Return < 0 if error or unknown. 2837 */ 2838 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) 2839 { 2840 BlockDriver *drv = bs->drv; 2841 if (!drv) { 2842 return -ENOMEDIUM; 2843 } 2844 if (drv->bdrv_get_allocated_file_size) { 2845 return drv->bdrv_get_allocated_file_size(bs); 2846 } 2847 if (bs->file) { 2848 return bdrv_get_allocated_file_size(bs->file); 2849 } 2850 return -ENOTSUP; 2851 } 2852 2853 /** 2854 * Length of a file in bytes. Return < 0 if error or unknown. 2855 */ 2856 int64_t bdrv_getlength(BlockDriverState *bs) 2857 { 2858 BlockDriver *drv = bs->drv; 2859 if (!drv) 2860 return -ENOMEDIUM; 2861 2862 if (bdrv_dev_has_removable_media(bs)) { 2863 if (drv->bdrv_getlength) { 2864 return drv->bdrv_getlength(bs); 2865 } 2866 } 2867 return bs->total_sectors * BDRV_SECTOR_SIZE; 2868 } 2869 2870 /* return 0 as number of sectors if no device present or error */ 2871 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) 2872 { 2873 int64_t length; 2874 length = bdrv_getlength(bs); 2875 if (length < 0) 2876 length = 0; 2877 else 2878 length = length >> BDRV_SECTOR_BITS; 2879 *nb_sectors_ptr = length; 2880 } 2881 2882 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, 2883 BlockdevOnError on_write_error) 2884 { 2885 bs->on_read_error = on_read_error; 2886 bs->on_write_error = on_write_error; 2887 } 2888 2889 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) 2890 { 2891 return is_read ? bs->on_read_error : bs->on_write_error; 2892 } 2893 2894 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) 2895 { 2896 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; 2897 2898 switch (on_err) { 2899 case BLOCKDEV_ON_ERROR_ENOSPC: 2900 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT; 2901 case BLOCKDEV_ON_ERROR_STOP: 2902 return BDRV_ACTION_STOP; 2903 case BLOCKDEV_ON_ERROR_REPORT: 2904 return BDRV_ACTION_REPORT; 2905 case BLOCKDEV_ON_ERROR_IGNORE: 2906 return BDRV_ACTION_IGNORE; 2907 default: 2908 abort(); 2909 } 2910 } 2911 2912 /* This is done by device models because, while the block layer knows 2913 * about the error, it does not know whether an operation comes from 2914 * the device or the block layer (from a job, for example).
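 *
 * For example (an illustrative sketch, not code from a real device model),
 * a device that sees -ENOSPC on a completed write would do:
 *
 *     BlockErrorAction action = bdrv_get_error_action(bs, false, ENOSPC);
 *     bdrv_error_action(bs, action, false, ENOSPC);
 *
 * i.e. it looks up the configured policy first and then lets the block
 * layer emit the QMP event and stop the VM if required.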
2915 */ 2916 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, 2917 bool is_read, int error) 2918 { 2919 assert(error >= 0); 2920 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read); 2921 if (action == BDRV_ACTION_STOP) { 2922 vm_stop(RUN_STATE_IO_ERROR); 2923 bdrv_iostatus_set_err(bs, error); 2924 } 2925 } 2926 2927 int bdrv_is_read_only(BlockDriverState *bs) 2928 { 2929 return bs->read_only; 2930 } 2931 2932 int bdrv_is_sg(BlockDriverState *bs) 2933 { 2934 return bs->sg; 2935 } 2936 2937 int bdrv_enable_write_cache(BlockDriverState *bs) 2938 { 2939 return bs->enable_write_cache; 2940 } 2941 2942 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) 2943 { 2944 bs->enable_write_cache = wce; 2945 2946 /* so a reopen() will preserve wce */ 2947 if (wce) { 2948 bs->open_flags |= BDRV_O_CACHE_WB; 2949 } else { 2950 bs->open_flags &= ~BDRV_O_CACHE_WB; 2951 } 2952 } 2953 2954 int bdrv_is_encrypted(BlockDriverState *bs) 2955 { 2956 if (bs->backing_hd && bs->backing_hd->encrypted) 2957 return 1; 2958 return bs->encrypted; 2959 } 2960 2961 int bdrv_key_required(BlockDriverState *bs) 2962 { 2963 BlockDriverState *backing_hd = bs->backing_hd; 2964 2965 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) 2966 return 1; 2967 return (bs->encrypted && !bs->valid_key); 2968 } 2969 2970 int bdrv_set_key(BlockDriverState *bs, const char *key) 2971 { 2972 int ret; 2973 if (bs->backing_hd && bs->backing_hd->encrypted) { 2974 ret = bdrv_set_key(bs->backing_hd, key); 2975 if (ret < 0) 2976 return ret; 2977 if (!bs->encrypted) 2978 return 0; 2979 } 2980 if (!bs->encrypted) { 2981 return -EINVAL; 2982 } else if (!bs->drv || !bs->drv->bdrv_set_key) { 2983 return -ENOMEDIUM; 2984 } 2985 ret = bs->drv->bdrv_set_key(bs, key); 2986 if (ret < 0) { 2987 bs->valid_key = 0; 2988 } else if (!bs->valid_key) { 2989 bs->valid_key = 1; 2990 /* call the change callback now, we skipped it on open */ 2991 bdrv_dev_change_media_cb(bs, true); 2992 } 2993 return ret; 2994 } 2995 2996 const char *bdrv_get_format_name(BlockDriverState *bs) 2997 { 2998 return bs->drv ? 
bs->drv->format_name : NULL; 2999 } 3000 3001 void bdrv_iterate_format(void (*it)(void *opaque, const char *name), 3002 void *opaque) 3003 { 3004 BlockDriver *drv; 3005 3006 QLIST_FOREACH(drv, &bdrv_drivers, list) { 3007 it(opaque, drv->format_name); 3008 } 3009 } 3010 3011 BlockDriverState *bdrv_find(const char *name) 3012 { 3013 BlockDriverState *bs; 3014 3015 QTAILQ_FOREACH(bs, &bdrv_states, list) { 3016 if (!strcmp(name, bs->device_name)) { 3017 return bs; 3018 } 3019 } 3020 return NULL; 3021 } 3022 3023 BlockDriverState *bdrv_next(BlockDriverState *bs) 3024 { 3025 if (!bs) { 3026 return QTAILQ_FIRST(&bdrv_states); 3027 } 3028 return QTAILQ_NEXT(bs, list); 3029 } 3030 3031 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) 3032 { 3033 BlockDriverState *bs; 3034 3035 QTAILQ_FOREACH(bs, &bdrv_states, list) { 3036 it(opaque, bs); 3037 } 3038 } 3039 3040 const char *bdrv_get_device_name(BlockDriverState *bs) 3041 { 3042 return bs->device_name; 3043 } 3044 3045 int bdrv_get_flags(BlockDriverState *bs) 3046 { 3047 return bs->open_flags; 3048 } 3049 3050 int bdrv_flush_all(void) 3051 { 3052 BlockDriverState *bs; 3053 int result = 0; 3054 3055 QTAILQ_FOREACH(bs, &bdrv_states, list) { 3056 int ret = bdrv_flush(bs); 3057 if (ret < 0 && !result) { 3058 result = ret; 3059 } 3060 } 3061 3062 return result; 3063 } 3064 3065 int bdrv_has_zero_init_1(BlockDriverState *bs) 3066 { 3067 return 1; 3068 } 3069 3070 int bdrv_has_zero_init(BlockDriverState *bs) 3071 { 3072 assert(bs->drv); 3073 3074 /* If BS is a copy on write image, it is initialized to 3075 the contents of the base image, which may not be zeroes. */ 3076 if (bs->backing_hd) { 3077 return 0; 3078 } 3079 if (bs->drv->bdrv_has_zero_init) { 3080 return bs->drv->bdrv_has_zero_init(bs); 3081 } 3082 3083 /* safe default */ 3084 return 0; 3085 } 3086 3087 typedef struct BdrvCoGetBlockStatusData { 3088 BlockDriverState *bs; 3089 BlockDriverState *base; 3090 int64_t sector_num; 3091 int nb_sectors; 3092 int *pnum; 3093 int64_t ret; 3094 bool done; 3095 } BdrvCoGetBlockStatusData; 3096 3097 /* 3098 * Returns true iff the specified sector is present in the disk image. Drivers 3099 * not implementing the functionality are assumed to not support backing files, 3100 * hence all their sectors are reported as allocated. 3101 * 3102 * If 'sector_num' is beyond the end of the disk image the return value is 0 3103 * and 'pnum' is set to 0. 3104 * 3105 * 'pnum' is set to the number of sectors (including and immediately following 3106 * the specified sector) that are known to be in the same 3107 * allocated/unallocated state. 3108 * 3109 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 3110 * beyond the end of the disk image it will be clamped. 
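 *
 * For example (illustrative only), a caller probing the first 16 sectors
 * through the synchronous bdrv_get_block_status() wrapper below might do:
 *
 *     int pnum;
 *     int64_t ret = bdrv_get_block_status(bs, 0, 16, &pnum);
 *     if (ret >= 0 && (ret & BDRV_BLOCK_ZERO)) {
 *         ... (the first pnum sectors read as zeroes)
 *     }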
3111 */ 3112 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 3113 int64_t sector_num, 3114 int nb_sectors, int *pnum) 3115 { 3116 int64_t length; 3117 int64_t n; 3118 int64_t ret, ret2; 3119 3120 length = bdrv_getlength(bs); 3121 if (length < 0) { 3122 return length; 3123 } 3124 3125 if (sector_num >= (length >> BDRV_SECTOR_BITS)) { 3126 *pnum = 0; 3127 return 0; 3128 } 3129 3130 n = bs->total_sectors - sector_num; 3131 if (n < nb_sectors) { 3132 nb_sectors = n; 3133 } 3134 3135 if (!bs->drv->bdrv_co_get_block_status) { 3136 *pnum = nb_sectors; 3137 ret = BDRV_BLOCK_DATA; 3138 if (bs->drv->protocol_name) { 3139 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 3140 } 3141 return ret; 3142 } 3143 3144 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 3145 if (ret < 0) { 3146 *pnum = 0; 3147 return ret; 3148 } 3149 3150 if (!(ret & BDRV_BLOCK_DATA)) { 3151 if (bdrv_has_zero_init(bs)) { 3152 ret |= BDRV_BLOCK_ZERO; 3153 } else if (bs->backing_hd) { 3154 BlockDriverState *bs2 = bs->backing_hd; 3155 int64_t length2 = bdrv_getlength(bs2); 3156 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) { 3157 ret |= BDRV_BLOCK_ZERO; 3158 } 3159 } 3160 } 3161 3162 if (bs->file && 3163 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 3164 (ret & BDRV_BLOCK_OFFSET_VALID)) { 3165 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 3166 *pnum, pnum); 3167 if (ret2 >= 0) { 3168 /* Ignore errors. This is just providing extra information, it 3169 * is useful but not necessary. 3170 */ 3171 ret |= (ret2 & BDRV_BLOCK_ZERO); 3172 } 3173 } 3174 3175 return ret; 3176 } 3177 3178 /* Coroutine wrapper for bdrv_get_block_status() */ 3179 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) 3180 { 3181 BdrvCoGetBlockStatusData *data = opaque; 3182 BlockDriverState *bs = data->bs; 3183 3184 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, 3185 data->pnum); 3186 data->done = true; 3187 } 3188 3189 /* 3190 * Synchronous wrapper around bdrv_co_get_block_status(). 3191 * 3192 * See bdrv_co_get_block_status() for details. 3193 */ 3194 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, 3195 int nb_sectors, int *pnum) 3196 { 3197 Coroutine *co; 3198 BdrvCoGetBlockStatusData data = { 3199 .bs = bs, 3200 .sector_num = sector_num, 3201 .nb_sectors = nb_sectors, 3202 .pnum = pnum, 3203 .done = false, 3204 }; 3205 3206 if (qemu_in_coroutine()) { 3207 /* Fast-path if already in coroutine context */ 3208 bdrv_get_block_status_co_entry(&data); 3209 } else { 3210 co = qemu_coroutine_create(bdrv_get_block_status_co_entry); 3211 qemu_coroutine_enter(co, &data); 3212 while (!data.done) { 3213 qemu_aio_wait(); 3214 } 3215 } 3216 return data.ret; 3217 } 3218 3219 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 3220 int nb_sectors, int *pnum) 3221 { 3222 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 3223 if (ret < 0) { 3224 return ret; 3225 } 3226 return 3227 (ret & BDRV_BLOCK_DATA) || 3228 ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs)); 3229 } 3230 3231 /* 3232 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 3233 * 3234 * Return true if the given sector is allocated in any image between 3235 * BASE and TOP (inclusive). BASE can be NULL to check if the given 3236 * sector is allocated in any image of the chain. Return false otherwise. 
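 *
 * For example (an illustrative chain): with base <- inter <- top, a sector
 * written only in 'inter' makes bdrv_is_allocated_above(top, base, ...)
 * return 1, while bdrv_is_allocated_above(top, inter, ...) returns 0
 * because only 'top' is inspected in that case.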
3237 * 3238 * 'pnum' is set to the number of sectors (including and immediately following 3239 * the specified sector) that are known to be in the same 3240 * allocated/unallocated state. 3241 * 3242 */ 3243 int bdrv_is_allocated_above(BlockDriverState *top, 3244 BlockDriverState *base, 3245 int64_t sector_num, 3246 int nb_sectors, int *pnum) 3247 { 3248 BlockDriverState *intermediate; 3249 int ret, n = nb_sectors; 3250 3251 intermediate = top; 3252 while (intermediate && intermediate != base) { 3253 int pnum_inter; 3254 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 3255 &pnum_inter); 3256 if (ret < 0) { 3257 return ret; 3258 } else if (ret) { 3259 *pnum = pnum_inter; 3260 return 1; 3261 } 3262 3263 /* 3264 * [sector_num, nb_sectors] is unallocated on top but intermediate 3265 * might have 3266 * 3267 * [sector_num+x, nb_sectors] allocated. 3268 */ 3269 if (n > pnum_inter && 3270 (intermediate == top || 3271 sector_num + pnum_inter < intermediate->total_sectors)) { 3272 n = pnum_inter; 3273 } 3274 3275 intermediate = intermediate->backing_hd; 3276 } 3277 3278 *pnum = n; 3279 return 0; 3280 } 3281 3282 const char *bdrv_get_encrypted_filename(BlockDriverState *bs) 3283 { 3284 if (bs->backing_hd && bs->backing_hd->encrypted) 3285 return bs->backing_file; 3286 else if (bs->encrypted) 3287 return bs->filename; 3288 else 3289 return NULL; 3290 } 3291 3292 void bdrv_get_backing_filename(BlockDriverState *bs, 3293 char *filename, int filename_size) 3294 { 3295 pstrcpy(filename, filename_size, bs->backing_file); 3296 } 3297 3298 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 3299 const uint8_t *buf, int nb_sectors) 3300 { 3301 BlockDriver *drv = bs->drv; 3302 if (!drv) 3303 return -ENOMEDIUM; 3304 if (!drv->bdrv_write_compressed) 3305 return -ENOTSUP; 3306 if (bdrv_check_request(bs, sector_num, nb_sectors)) 3307 return -EIO; 3308 3309 assert(!bs->dirty_bitmap); 3310 3311 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 3312 } 3313 3314 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 3315 { 3316 BlockDriver *drv = bs->drv; 3317 if (!drv) 3318 return -ENOMEDIUM; 3319 if (!drv->bdrv_get_info) 3320 return -ENOTSUP; 3321 memset(bdi, 0, sizeof(*bdi)); 3322 return drv->bdrv_get_info(bs, bdi); 3323 } 3324 3325 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) 3326 { 3327 BlockDriver *drv = bs->drv; 3328 if (drv && drv->bdrv_get_specific_info) { 3329 return drv->bdrv_get_specific_info(bs); 3330 } 3331 return NULL; 3332 } 3333 3334 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 3335 int64_t pos, int size) 3336 { 3337 QEMUIOVector qiov; 3338 struct iovec iov = { 3339 .iov_base = (void *) buf, 3340 .iov_len = size, 3341 }; 3342 3343 qemu_iovec_init_external(&qiov, &iov, 1); 3344 return bdrv_writev_vmstate(bs, &qiov, pos); 3345 } 3346 3347 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 3348 { 3349 BlockDriver *drv = bs->drv; 3350 3351 if (!drv) { 3352 return -ENOMEDIUM; 3353 } else if (drv->bdrv_save_vmstate) { 3354 return drv->bdrv_save_vmstate(bs, qiov, pos); 3355 } else if (bs->file) { 3356 return bdrv_writev_vmstate(bs->file, qiov, pos); 3357 } 3358 3359 return -ENOTSUP; 3360 } 3361 3362 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 3363 int64_t pos, int size) 3364 { 3365 BlockDriver *drv = bs->drv; 3366 if (!drv) 3367 return -ENOMEDIUM; 3368 if (drv->bdrv_load_vmstate) 3369 return drv->bdrv_load_vmstate(bs, buf, pos, size); 3370 if (bs->file) 3371 return
bdrv_load_vmstate(bs->file, buf, pos, size); 3372 return -ENOTSUP; 3373 } 3374 3375 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) 3376 { 3377 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { 3378 return; 3379 } 3380 3381 bs->drv->bdrv_debug_event(bs, event); 3382 } 3383 3384 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, 3385 const char *tag) 3386 { 3387 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { 3388 bs = bs->file; 3389 } 3390 3391 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { 3392 return bs->drv->bdrv_debug_breakpoint(bs, event, tag); 3393 } 3394 3395 return -ENOTSUP; 3396 } 3397 3398 int bdrv_debug_resume(BlockDriverState *bs, const char *tag) 3399 { 3400 while (bs && bs->drv && !bs->drv->bdrv_debug_resume) { 3401 bs = bs->file; 3402 } 3403 3404 if (bs && bs->drv && bs->drv->bdrv_debug_resume) { 3405 return bs->drv->bdrv_debug_resume(bs, tag); 3406 } 3407 3408 return -ENOTSUP; 3409 } 3410 3411 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) 3412 { 3413 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { 3414 bs = bs->file; 3415 } 3416 3417 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { 3418 return bs->drv->bdrv_debug_is_suspended(bs, tag); 3419 } 3420 3421 return false; 3422 } 3423 3424 int bdrv_is_snapshot(BlockDriverState *bs) 3425 { 3426 return !!(bs->open_flags & BDRV_O_SNAPSHOT); 3427 } 3428 3429 /* backing_file can either be relative, or absolute, or a protocol. If it is 3430 * relative, it must be relative to the chain. So, passing in bs->filename 3431 * from a BDS as backing_file should not be done, as that may be relative to 3432 * the CWD rather than the chain. */ 3433 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, 3434 const char *backing_file) 3435 { 3436 char *filename_full = NULL; 3437 char *backing_file_full = NULL; 3438 char *filename_tmp = NULL; 3439 int is_protocol = 0; 3440 BlockDriverState *curr_bs = NULL; 3441 BlockDriverState *retval = NULL; 3442 3443 if (!bs || !bs->drv || !backing_file) { 3444 return NULL; 3445 } 3446 3447 filename_full = g_malloc(PATH_MAX); 3448 backing_file_full = g_malloc(PATH_MAX); 3449 filename_tmp = g_malloc(PATH_MAX); 3450 3451 is_protocol = path_has_protocol(backing_file); 3452 3453 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { 3454 3455 /* If either of the filename paths is actually a protocol, then 3456 * compare unmodified paths; otherwise make paths relative */ 3457 if (is_protocol || path_has_protocol(curr_bs->backing_file)) { 3458 if (strcmp(backing_file, curr_bs->backing_file) == 0) { 3459 retval = curr_bs->backing_hd; 3460 break; 3461 } 3462 } else { 3463 /* If not an absolute filename path, make it relative to the current 3464 * image's filename path */ 3465 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 3466 backing_file); 3467 3468 /* We are going to compare absolute pathnames */ 3469 if (!realpath(filename_tmp, filename_full)) { 3470 continue; 3471 } 3472 3473 /* We need to make sure the backing filename we are comparing against 3474 * is relative to the current image filename (or absolute) */ 3475 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 3476 curr_bs->backing_file); 3477 3478 if (!realpath(filename_tmp, backing_file_full)) { 3479 continue; 3480 } 3481 3482 if (strcmp(backing_file_full, filename_full) == 0) { 3483 retval = curr_bs->backing_hd; 3484 break; 3485 } 3486 } 3487 } 3488 3489 g_free(filename_full); 3490 g_free(backing_file_full); 3491 
g_free(filename_tmp); 3492 return retval; 3493 } 3494 3495 int bdrv_get_backing_file_depth(BlockDriverState *bs) 3496 { 3497 if (!bs->drv) { 3498 return 0; 3499 } 3500 3501 if (!bs->backing_hd) { 3502 return 0; 3503 } 3504 3505 return 1 + bdrv_get_backing_file_depth(bs->backing_hd); 3506 } 3507 3508 BlockDriverState *bdrv_find_base(BlockDriverState *bs) 3509 { 3510 BlockDriverState *curr_bs = NULL; 3511 3512 if (!bs) { 3513 return NULL; 3514 } 3515 3516 curr_bs = bs; 3517 3518 while (curr_bs->backing_hd) { 3519 curr_bs = curr_bs->backing_hd; 3520 } 3521 return curr_bs; 3522 } 3523 3524 /**************************************************************/ 3525 /* async I/Os */ 3526 3527 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 3528 QEMUIOVector *qiov, int nb_sectors, 3529 BlockDriverCompletionFunc *cb, void *opaque) 3530 { 3531 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 3532 3533 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 3534 cb, opaque, false); 3535 } 3536 3537 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 3538 QEMUIOVector *qiov, int nb_sectors, 3539 BlockDriverCompletionFunc *cb, void *opaque) 3540 { 3541 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 3542 3543 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 3544 cb, opaque, true); 3545 } 3546 3547 3548 typedef struct MultiwriteCB { 3549 int error; 3550 int num_requests; 3551 int num_callbacks; 3552 struct { 3553 BlockDriverCompletionFunc *cb; 3554 void *opaque; 3555 QEMUIOVector *free_qiov; 3556 } callbacks[]; 3557 } MultiwriteCB; 3558 3559 static void multiwrite_user_cb(MultiwriteCB *mcb) 3560 { 3561 int i; 3562 3563 for (i = 0; i < mcb->num_callbacks; i++) { 3564 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 3565 if (mcb->callbacks[i].free_qiov) { 3566 qemu_iovec_destroy(mcb->callbacks[i].free_qiov); 3567 } 3568 g_free(mcb->callbacks[i].free_qiov); 3569 } 3570 } 3571 3572 static void multiwrite_cb(void *opaque, int ret) 3573 { 3574 MultiwriteCB *mcb = opaque; 3575 3576 trace_multiwrite_cb(mcb, ret); 3577 3578 if (ret < 0 && !mcb->error) { 3579 mcb->error = ret; 3580 } 3581 3582 mcb->num_requests--; 3583 if (mcb->num_requests == 0) { 3584 multiwrite_user_cb(mcb); 3585 g_free(mcb); 3586 } 3587 } 3588 3589 static int multiwrite_req_compare(const void *a, const void *b) 3590 { 3591 const BlockRequest *req1 = a, *req2 = b; 3592 3593 /* 3594 * Note that we can't simply subtract req2->sector from req1->sector 3595 * here as that could overflow the return value. 3596 */ 3597 if (req1->sector > req2->sector) { 3598 return 1; 3599 } else if (req1->sector < req2->sector) { 3600 return -1; 3601 } else { 3602 return 0; 3603 } 3604 } 3605 3606 /* 3607 * Takes a bunch of requests and tries to merge them. Returns the number of 3608 * requests that remain after merging. 3609 */ 3610 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, 3611 int num_reqs, MultiwriteCB *mcb) 3612 { 3613 int i, outidx; 3614 3615 // Sort requests by start sector 3616 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); 3617 3618 // Check if adjacent requests touch the same clusters. If so, combine them, 3619 // filling up gaps with zero sectors. 3620 outidx = 0; 3621 for (i = 1; i < num_reqs; i++) { 3622 int merge = 0; 3623 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; 3624 3625 // Handle exactly sequential writes and overlapping writes. 
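// (e.g. a request covering sectors [0, 8) merges with one that starts at
// sector 8 or earlier; requests separated by a gap are never merged)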
3626 if (reqs[i].sector <= oldreq_last) { 3627 merge = 1; 3628 } 3629 3630 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { 3631 merge = 0; 3632 } 3633 3634 if (merge) { 3635 size_t size; 3636 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); 3637 qemu_iovec_init(qiov, 3638 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); 3639 3640 // Add the first request to the merged one. If the requests are 3641 // overlapping, drop the last sectors of the first request. 3642 size = (reqs[i].sector - reqs[outidx].sector) << 9; 3643 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); 3644 3645 // We should not need to add any zeros between the two requests 3646 assert (reqs[i].sector <= oldreq_last); 3647 3648 // Add the second request 3649 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); 3650 3651 reqs[outidx].nb_sectors = qiov->size >> 9; 3652 reqs[outidx].qiov = qiov; 3653 3654 mcb->callbacks[i].free_qiov = reqs[outidx].qiov; 3655 } else { 3656 outidx++; 3657 reqs[outidx].sector = reqs[i].sector; 3658 reqs[outidx].nb_sectors = reqs[i].nb_sectors; 3659 reqs[outidx].qiov = reqs[i].qiov; 3660 } 3661 } 3662 3663 return outidx + 1; 3664 } 3665 3666 /* 3667 * Submit multiple AIO write requests at once. 3668 * 3669 * On success, the function returns 0 and all requests in the reqs array have 3670 * been submitted. On error, this function returns -1, and any of the 3671 * requests may or may not be submitted yet. In particular, this means that the 3672 * callback will be called for some of the requests, for others it won't. The 3673 * caller must check the error field of the BlockRequest to wait for the right 3674 * callbacks (if error != 0, no callback will be called). 3675 * 3676 * The implementation may modify the contents of the reqs array, e.g. to merge 3677 * requests. However, the fields opaque and error are left unmodified as they 3678 * are used to signal failure for a single request to the caller. 3679 */ 3680 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 3681 { 3682 MultiwriteCB *mcb; 3683 int i; 3684 3685 /* don't submit writes if we don't have a medium */ 3686 if (bs->drv == NULL) { 3687 for (i = 0; i < num_reqs; i++) { 3688 reqs[i].error = -ENOMEDIUM; 3689 } 3690 return -1; 3691 } 3692 3693 if (num_reqs == 0) { 3694 return 0; 3695 } 3696 3697 // Create MultiwriteCB structure 3698 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 3699 mcb->num_requests = 0; 3700 mcb->num_callbacks = num_reqs; 3701 3702 for (i = 0; i < num_reqs; i++) { 3703 mcb->callbacks[i].cb = reqs[i].cb; 3704 mcb->callbacks[i].opaque = reqs[i].opaque; 3705 } 3706 3707 // Check for mergeable requests 3708 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 3709 3710 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 3711 3712 /* Run the aio requests.
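 * num_requests is set to its final value before any request is submitted:
 * if it were incremented one submission at a time, a write that completes
 * synchronously could see the counter drop back to zero and free mcb while
 * later requests are still being submitted.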
*/ 3713 mcb->num_requests = num_reqs; 3714 for (i = 0; i < num_reqs; i++) { 3715 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, 3716 reqs[i].nb_sectors, multiwrite_cb, mcb); 3717 } 3718 3719 return 0; 3720 } 3721 3722 void bdrv_aio_cancel(BlockDriverAIOCB *acb) 3723 { 3724 acb->aiocb_info->cancel(acb); 3725 } 3726 3727 /**************************************************************/ 3728 /* async block device emulation */ 3729 3730 typedef struct BlockDriverAIOCBSync { 3731 BlockDriverAIOCB common; 3732 QEMUBH *bh; 3733 int ret; 3734 /* vector translation state */ 3735 QEMUIOVector *qiov; 3736 uint8_t *bounce; 3737 int is_write; 3738 } BlockDriverAIOCBSync; 3739 3740 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) 3741 { 3742 BlockDriverAIOCBSync *acb = 3743 container_of(blockacb, BlockDriverAIOCBSync, common); 3744 qemu_bh_delete(acb->bh); 3745 acb->bh = NULL; 3746 qemu_aio_release(acb); 3747 } 3748 3749 static const AIOCBInfo bdrv_em_aiocb_info = { 3750 .aiocb_size = sizeof(BlockDriverAIOCBSync), 3751 .cancel = bdrv_aio_cancel_em, 3752 }; 3753 3754 static void bdrv_aio_bh_cb(void *opaque) 3755 { 3756 BlockDriverAIOCBSync *acb = opaque; 3757 3758 if (!acb->is_write) 3759 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 3760 qemu_vfree(acb->bounce); 3761 acb->common.cb(acb->common.opaque, acb->ret); 3762 qemu_bh_delete(acb->bh); 3763 acb->bh = NULL; 3764 qemu_aio_release(acb); 3765 } 3766 3767 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 3768 int64_t sector_num, 3769 QEMUIOVector *qiov, 3770 int nb_sectors, 3771 BlockDriverCompletionFunc *cb, 3772 void *opaque, 3773 int is_write) 3774 3775 { 3776 BlockDriverAIOCBSync *acb; 3777 3778 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 3779 acb->is_write = is_write; 3780 acb->qiov = qiov; 3781 acb->bounce = qemu_blockalign(bs, qiov->size); 3782 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); 3783 3784 if (is_write) { 3785 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 3786 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 3787 } else { 3788 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 3789 } 3790 3791 qemu_bh_schedule(acb->bh); 3792 3793 return &acb->common; 3794 } 3795 3796 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 3797 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 3798 BlockDriverCompletionFunc *cb, void *opaque) 3799 { 3800 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 3801 } 3802 3803 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 3804 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 3805 BlockDriverCompletionFunc *cb, void *opaque) 3806 { 3807 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 3808 } 3809 3810 3811 typedef struct BlockDriverAIOCBCoroutine { 3812 BlockDriverAIOCB common; 3813 BlockRequest req; 3814 bool is_write; 3815 bool *done; 3816 QEMUBH* bh; 3817 } BlockDriverAIOCBCoroutine; 3818 3819 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb) 3820 { 3821 BlockDriverAIOCBCoroutine *acb = 3822 container_of(blockacb, BlockDriverAIOCBCoroutine, common); 3823 bool done = false; 3824 3825 acb->done = &done; 3826 while (!done) { 3827 qemu_aio_wait(); 3828 } 3829 } 3830 3831 static const AIOCBInfo bdrv_em_co_aiocb_info = { 3832 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine), 3833 .cancel = bdrv_aio_co_cancel_em, 3834 }; 3835 3836 static void bdrv_co_em_bh(void *opaque) 3837 { 3838 
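/* Completion bottom half: runs outside of coroutine context to hand the
 * request's error code to the user callback, then releases the ACB. */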
BlockDriverAIOCBCoroutine *acb = opaque; 3839 3840 acb->common.cb(acb->common.opaque, acb->req.error); 3841 3842 if (acb->done) { 3843 *acb->done = true; 3844 } 3845 3846 qemu_bh_delete(acb->bh); 3847 qemu_aio_release(acb); 3848 } 3849 3850 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 3851 static void coroutine_fn bdrv_co_do_rw(void *opaque) 3852 { 3853 BlockDriverAIOCBCoroutine *acb = opaque; 3854 BlockDriverState *bs = acb->common.bs; 3855 3856 if (!acb->is_write) { 3857 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 3858 acb->req.nb_sectors, acb->req.qiov, 0); 3859 } else { 3860 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 3861 acb->req.nb_sectors, acb->req.qiov, 0); 3862 } 3863 3864 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 3865 qemu_bh_schedule(acb->bh); 3866 } 3867 3868 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 3869 int64_t sector_num, 3870 QEMUIOVector *qiov, 3871 int nb_sectors, 3872 BlockDriverCompletionFunc *cb, 3873 void *opaque, 3874 bool is_write) 3875 { 3876 Coroutine *co; 3877 BlockDriverAIOCBCoroutine *acb; 3878 3879 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 3880 acb->req.sector = sector_num; 3881 acb->req.nb_sectors = nb_sectors; 3882 acb->req.qiov = qiov; 3883 acb->is_write = is_write; 3884 acb->done = NULL; 3885 3886 co = qemu_coroutine_create(bdrv_co_do_rw); 3887 qemu_coroutine_enter(co, acb); 3888 3889 return &acb->common; 3890 } 3891 3892 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 3893 { 3894 BlockDriverAIOCBCoroutine *acb = opaque; 3895 BlockDriverState *bs = acb->common.bs; 3896 3897 acb->req.error = bdrv_co_flush(bs); 3898 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 3899 qemu_bh_schedule(acb->bh); 3900 } 3901 3902 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, 3903 BlockDriverCompletionFunc *cb, void *opaque) 3904 { 3905 trace_bdrv_aio_flush(bs, opaque); 3906 3907 Coroutine *co; 3908 BlockDriverAIOCBCoroutine *acb; 3909 3910 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 3911 acb->done = NULL; 3912 3913 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 3914 qemu_coroutine_enter(co, acb); 3915 3916 return &acb->common; 3917 } 3918 3919 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 3920 { 3921 BlockDriverAIOCBCoroutine *acb = opaque; 3922 BlockDriverState *bs = acb->common.bs; 3923 3924 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 3925 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 3926 qemu_bh_schedule(acb->bh); 3927 } 3928 3929 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, 3930 int64_t sector_num, int nb_sectors, 3931 BlockDriverCompletionFunc *cb, void *opaque) 3932 { 3933 Coroutine *co; 3934 BlockDriverAIOCBCoroutine *acb; 3935 3936 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 3937 3938 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 3939 acb->req.sector = sector_num; 3940 acb->req.nb_sectors = nb_sectors; 3941 acb->done = NULL; 3942 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 3943 qemu_coroutine_enter(co, acb); 3944 3945 return &acb->common; 3946 } 3947 3948 void bdrv_init(void) 3949 { 3950 module_call_init(MODULE_INIT_BLOCK); 3951 } 3952 3953 void bdrv_init_with_whitelist(void) 3954 { 3955 use_bdrv_whitelist = 1; 3956 bdrv_init(); 3957 } 3958 3959 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 3960 BlockDriverCompletionFunc *cb, void *opaque) 3961 { 3962 BlockDriverAIOCB *acb; 3963 3964 acb = g_slice_alloc(aiocb_info->aiocb_size); 3965 
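/* Only the common part is initialized here; the driver-specific state that
 * follows it in the allocation is filled in by the caller. */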
acb->aiocb_info = aiocb_info; 3966 acb->bs = bs; 3967 acb->cb = cb; 3968 acb->opaque = opaque; 3969 return acb; 3970 } 3971 3972 void qemu_aio_release(void *p) 3973 { 3974 BlockDriverAIOCB *acb = p; 3975 g_slice_free1(acb->aiocb_info->aiocb_size, acb); 3976 } 3977 3978 /**************************************************************/ 3979 /* Coroutine block device emulation */ 3980 3981 typedef struct CoroutineIOCompletion { 3982 Coroutine *coroutine; 3983 int ret; 3984 } CoroutineIOCompletion; 3985 3986 static void bdrv_co_io_em_complete(void *opaque, int ret) 3987 { 3988 CoroutineIOCompletion *co = opaque; 3989 3990 co->ret = ret; 3991 qemu_coroutine_enter(co->coroutine, NULL); 3992 } 3993 3994 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 3995 int nb_sectors, QEMUIOVector *iov, 3996 bool is_write) 3997 { 3998 CoroutineIOCompletion co = { 3999 .coroutine = qemu_coroutine_self(), 4000 }; 4001 BlockDriverAIOCB *acb; 4002 4003 if (is_write) { 4004 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 4005 bdrv_co_io_em_complete, &co); 4006 } else { 4007 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 4008 bdrv_co_io_em_complete, &co); 4009 } 4010 4011 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 4012 if (!acb) { 4013 return -EIO; 4014 } 4015 qemu_coroutine_yield(); 4016 4017 return co.ret; 4018 } 4019 4020 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 4021 int64_t sector_num, int nb_sectors, 4022 QEMUIOVector *iov) 4023 { 4024 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 4025 } 4026 4027 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 4028 int64_t sector_num, int nb_sectors, 4029 QEMUIOVector *iov) 4030 { 4031 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 4032 } 4033 4034 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 4035 { 4036 RwCo *rwco = opaque; 4037 4038 rwco->ret = bdrv_co_flush(rwco->bs); 4039 } 4040 4041 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 4042 { 4043 int ret; 4044 4045 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { 4046 return 0; 4047 } 4048 4049 /* Write back cached data to the OS even with cache=unsafe */ 4050 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 4051 if (bs->drv->bdrv_co_flush_to_os) { 4052 ret = bs->drv->bdrv_co_flush_to_os(bs); 4053 if (ret < 0) { 4054 return ret; 4055 } 4056 } 4057 4058 /* But don't actually force it to the disk with cache=unsafe */ 4059 if (bs->open_flags & BDRV_O_NO_FLUSH) { 4060 goto flush_parent; 4061 } 4062 4063 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 4064 if (bs->drv->bdrv_co_flush_to_disk) { 4065 ret = bs->drv->bdrv_co_flush_to_disk(bs); 4066 } else if (bs->drv->bdrv_aio_flush) { 4067 BlockDriverAIOCB *acb; 4068 CoroutineIOCompletion co = { 4069 .coroutine = qemu_coroutine_self(), 4070 }; 4071 4072 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 4073 if (acb == NULL) { 4074 ret = -EIO; 4075 } else { 4076 qemu_coroutine_yield(); 4077 ret = co.ret; 4078 } 4079 } else { 4080 /* 4081 * Some block drivers always operate in either writethrough or unsafe 4082 * mode and therefore don't support bdrv_flush. Usually qemu doesn't 4083 * know how the server works (because the behaviour is hardcoded or 4084 * depends on server-side configuration), so we can't ensure that 4085 * everything is safe on disk. Returning an error doesn't work because 4086 * that would break guests even if the server operates in writethrough 4087 * mode.
4088 * 4089 * Let's hope the user knows what he's doing. 4090 */ 4091 ret = 0; 4092 } 4093 if (ret < 0) { 4094 return ret; 4095 } 4096 4097 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 4098 * in the case of cache=unsafe, so there are no useless flushes. 4099 */ 4100 flush_parent: 4101 return bdrv_co_flush(bs->file); 4102 } 4103 4104 void bdrv_invalidate_cache(BlockDriverState *bs) 4105 { 4106 if (bs->drv && bs->drv->bdrv_invalidate_cache) { 4107 bs->drv->bdrv_invalidate_cache(bs); 4108 } 4109 } 4110 4111 void bdrv_invalidate_cache_all(void) 4112 { 4113 BlockDriverState *bs; 4114 4115 QTAILQ_FOREACH(bs, &bdrv_states, list) { 4116 bdrv_invalidate_cache(bs); 4117 } 4118 } 4119 4120 void bdrv_clear_incoming_migration_all(void) 4121 { 4122 BlockDriverState *bs; 4123 4124 QTAILQ_FOREACH(bs, &bdrv_states, list) { 4125 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING); 4126 } 4127 } 4128 4129 int bdrv_flush(BlockDriverState *bs) 4130 { 4131 Coroutine *co; 4132 RwCo rwco = { 4133 .bs = bs, 4134 .ret = NOT_DONE, 4135 }; 4136 4137 if (qemu_in_coroutine()) { 4138 /* Fast-path if already in coroutine context */ 4139 bdrv_flush_co_entry(&rwco); 4140 } else { 4141 co = qemu_coroutine_create(bdrv_flush_co_entry); 4142 qemu_coroutine_enter(co, &rwco); 4143 while (rwco.ret == NOT_DONE) { 4144 qemu_aio_wait(); 4145 } 4146 } 4147 4148 return rwco.ret; 4149 } 4150 4151 static void coroutine_fn bdrv_discard_co_entry(void *opaque) 4152 { 4153 RwCo *rwco = opaque; 4154 4155 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); 4156 } 4157 4158 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, 4159 int nb_sectors) 4160 { 4161 if (!bs->drv) { 4162 return -ENOMEDIUM; 4163 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { 4164 return -EIO; 4165 } else if (bs->read_only) { 4166 return -EROFS; 4167 } 4168 4169 if (bs->dirty_bitmap) { 4170 bdrv_reset_dirty(bs, sector_num, nb_sectors); 4171 } 4172 4173 /* Do nothing if disabled. 
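 * Note that the request still succeeds in that case (and the dirty bitmap
 * was already reset above); it is simply never passed down to the driver.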
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media.  Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
                                 unsigned long int req, void *buf,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}
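/*
 * Buffer alignment matters for backends opened with O_DIRECT, where the
 * kernel rejects I/O from buffers that are not aligned to the device's
 * sector size.  A hedged usage sketch (len is an assumed, illustrative
 * size):
 *
 *     void *buf = qemu_blockalign(bs, len);  // aligned to bs->buffer_alignment
 *     // ... fill buf and submit I/O ...
 *     qemu_vfree(buf);
 *
 * qemu_vfree() is the matching release function for qemu_memalign()-backed
 * allocations such as those returned by qemu_blockalign().
 */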
/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;

    assert((granularity & (granularity - 1)) == 0);

    if (granularity) {
        granularity >>= BDRV_SECTOR_BITS;
        assert(!bs->dirty_bitmap);
        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    } else {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (bs->dirty_bitmap) {
        return hbitmap_get(bs->dirty_bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (bs->dirty_bitmap) {
        return hbitmap_count(bs->dirty_bitmap);
    } else {
        return 0;
    }
}
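/*
 * Worked example for bdrv_set_dirty_tracking(): with a granularity of
 * 65536 bytes, the shift by BDRV_SECTOR_BITS (9) yields 128 sectors per
 * bitmap bit, and ffs(128) - 1 == 7 is passed to hbitmap_alloc() so that
 * each bit of the hbitmap covers 2^7 = 128 sectors.  A hedged setup sketch:
 *
 *     bdrv_set_dirty_tracking(bs, 65536);    // 64 KiB tracking granularity
 *     // ... writes mark bits via bdrv_set_dirty() ...
 *     bdrv_set_dirty_tracking(bs, 0);        // free the bitmap again
 */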
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
                enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
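/*
 * The two accounting helpers above are meant to bracket a single
 * guest-visible I/O.  A hedged sketch from a device model's point of view
 * (req_len and the read itself are assumed, illustrative details):
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, req_len, BDRV_ACCT_READ);
 *     // ... issue and complete the read ...
 *     bdrv_acct_done(bs, &cookie);
 *
 * The cookie records the byte count, start timestamp and I/O type, so that
 * bdrv_acct_done() can update the per-type byte, op and latency counters.
 */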
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there.  */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_unref(bs);
    }
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
}

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}
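/*
 * bdrv_img_create() is the backend used by "qemu-img create".  A hedged
 * usage sketch, roughly equivalent to
 * "qemu-img create -f qcow2 -o cluster_size=65536 test.qcow2 1G"
 * (the filename and option string are illustrative, and options is cast
 * because the parameter is non-const):
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL,
 *                     (char *)"cluster_size=65536",
 *                     1024 * 1024 * 1024, 0, &err, false);
 *     if (error_is_set(&err)) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 */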