/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* check if this I/O must wait */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
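/*
 * Hedged usage sketch (not part of the original file): the comment on
 * bdrv_io_limits_enable() above implies this call order. The
 * ThrottleConfig field names below are assumptions based on the
 * throttle API used in this file.
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; // 10 MB/s
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */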
"<protocol>:" */ 232 static int path_has_protocol(const char *path) 233 { 234 const char *p; 235 236 #ifdef _WIN32 237 if (is_windows_drive(path) || 238 is_windows_drive_prefix(path)) { 239 return 0; 240 } 241 p = path + strcspn(path, ":/\\"); 242 #else 243 p = path + strcspn(path, ":/"); 244 #endif 245 246 return *p == ':'; 247 } 248 249 int path_is_absolute(const char *path) 250 { 251 #ifdef _WIN32 252 /* specific case for names like: "\\.\d:" */ 253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) { 254 return 1; 255 } 256 return (*path == '/' || *path == '\\'); 257 #else 258 return (*path == '/'); 259 #endif 260 } 261 262 /* if filename is absolute, just copy it to dest. Otherwise, build a 263 path to it by considering it is relative to base_path. URL are 264 supported. */ 265 void path_combine(char *dest, int dest_size, 266 const char *base_path, 267 const char *filename) 268 { 269 const char *p, *p1; 270 int len; 271 272 if (dest_size <= 0) 273 return; 274 if (path_is_absolute(filename)) { 275 pstrcpy(dest, dest_size, filename); 276 } else { 277 p = strchr(base_path, ':'); 278 if (p) 279 p++; 280 else 281 p = base_path; 282 p1 = strrchr(base_path, '/'); 283 #ifdef _WIN32 284 { 285 const char *p2; 286 p2 = strrchr(base_path, '\\'); 287 if (!p1 || p2 > p1) 288 p1 = p2; 289 } 290 #endif 291 if (p1) 292 p1++; 293 else 294 p1 = base_path; 295 if (p1 > p) 296 p = p1; 297 len = p - base_path; 298 if (len > dest_size - 1) 299 len = dest_size - 1; 300 memcpy(dest, base_path, len); 301 dest[len] = '\0'; 302 pstrcat(dest, dest_size, filename); 303 } 304 } 305 306 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz) 307 { 308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) { 309 pstrcpy(dest, sz, bs->backing_file); 310 } else { 311 path_combine(dest, sz, bs->filename, bs->backing_file); 312 } 313 } 314 315 void bdrv_register(BlockDriver *bdrv) 316 { 317 /* Block drivers without coroutine functions need emulation */ 318 if (!bdrv->bdrv_co_readv) { 319 bdrv->bdrv_co_readv = bdrv_co_readv_em; 320 bdrv->bdrv_co_writev = bdrv_co_writev_em; 321 322 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if 323 * the block driver lacks aio we need to emulate that too. 
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ?
           drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
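/*
 * Hedged usage sketch, mirroring bdrv_append_temp_snapshot() below:
 * creating a qcow2 image via bdrv_create(). Assumes the qcow2 driver
 * is registered and accepts BLOCK_OPT_SIZE in its create_opts.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *     ret = bdrv_create(drv, "/tmp/img.qcow2", opts, &local_err);
 *     qemu_opts_del(opts);
 */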
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
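/*
 * Example behaviour (hedged, derived from the parsing above): a
 * "<protocol>:" prefix selects the driver whose protocol_name matches,
 * while a plain path falls back to the "file" driver, assuming no host
 * device driver claims it first.
 *
 *     bdrv_find_protocol("nbd:localhost:10809", true);  // "nbd" driver
 *     bdrv_find_protocol("/var/img/disk.raw", true);    // "file" driver
 */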
/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf contains the image's first @buf_size bytes.
 * @buf_size is the buffer size in bytes (generally 2048, but can be smaller
 *           if the image file is smaller)
 * @filename is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
static BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                                   const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
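/*
 * Quick reference (derived from the mapping above):
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     bdrv_parse_cache_flags("unsafe", &flags);
 *     // flags == BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 */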
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                       "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{

    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ?
                        backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
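/*
 * Hedged usage sketch (mirrors the caller in bdrv_open() below):
 * opening the protocol layer as the "file" child of a format node:
 *
 *     BlockDriverState *file = NULL;
 *     ret = bdrv_open_image(&file, filename, options, "file",
 *                           bdrv_inherited_flags(flags), true, &local_err);
 */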
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ?
                                 qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
1623 * 1624 */ 1625 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, 1626 BlockDriverState *bs, int flags) 1627 { 1628 assert(bs != NULL); 1629 1630 BlockReopenQueueEntry *bs_entry; 1631 if (bs_queue == NULL) { 1632 bs_queue = g_new0(BlockReopenQueue, 1); 1633 QSIMPLEQ_INIT(bs_queue); 1634 } 1635 1636 /* bdrv_open() masks this flag out */ 1637 flags &= ~BDRV_O_PROTOCOL; 1638 1639 if (bs->file) { 1640 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags)); 1641 } 1642 1643 bs_entry = g_new0(BlockReopenQueueEntry, 1); 1644 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry); 1645 1646 bs_entry->state.bs = bs; 1647 bs_entry->state.flags = flags; 1648 1649 return bs_queue; 1650 } 1651 1652 /* 1653 * Reopen multiple BlockDriverStates atomically & transactionally. 1654 * 1655 * The queue passed in (bs_queue) must have been built up previous 1656 * via bdrv_reopen_queue(). 1657 * 1658 * Reopens all BDS specified in the queue, with the appropriate 1659 * flags. All devices are prepared for reopen, and failure of any 1660 * device will cause all device changes to be abandonded, and intermediate 1661 * data cleaned up. 1662 * 1663 * If all devices prepare successfully, then the changes are committed 1664 * to all devices. 1665 * 1666 */ 1667 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) 1668 { 1669 int ret = -1; 1670 BlockReopenQueueEntry *bs_entry, *next; 1671 Error *local_err = NULL; 1672 1673 assert(bs_queue != NULL); 1674 1675 bdrv_drain_all(); 1676 1677 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { 1678 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) { 1679 error_propagate(errp, local_err); 1680 goto cleanup; 1681 } 1682 bs_entry->prepared = true; 1683 } 1684 1685 /* If we reach this point, we have success and just need to apply the 1686 * changes 1687 */ 1688 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { 1689 bdrv_reopen_commit(&bs_entry->state); 1690 } 1691 1692 ret = 0; 1693 1694 cleanup: 1695 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { 1696 if (ret && bs_entry->prepared) { 1697 bdrv_reopen_abort(&bs_entry->state); 1698 } 1699 g_free(bs_entry); 1700 } 1701 g_free(bs_queue); 1702 return ret; 1703 } 1704 1705 1706 /* Reopen a single BlockDriverState with the specified flags. */ 1707 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) 1708 { 1709 int ret = -1; 1710 Error *local_err = NULL; 1711 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags); 1712 1713 ret = bdrv_reopen_multiple(queue, &local_err); 1714 if (local_err != NULL) { 1715 error_propagate(errp, local_err); 1716 } 1717 return ret; 1718 } 1719 1720 1721 /* 1722 * Prepares a BlockDriverState for reopen. All changes are staged in the 1723 * 'opaque' field of the BDRVReopenState, which is used and allocated by 1724 * the block driver layer .bdrv_reopen_prepare() 1725 * 1726 * bs is the BlockDriverState to reopen 1727 * flags are the new open flags 1728 * queue is the reopen queue 1729 * 1730 * Returns 0 on success, non-zero on error. On error errp will be set 1731 * as well. 1732 * 1733 * On failure, bdrv_reopen_abort() will be called to clean up any data. 
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }
}
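/*
 * Hedged sketch of the bdrv_drain() contract above: a caller quiescing
 * one device before a graph change (assumes op blockers are installed
 * and that the caller may legally take the BDS AioContext):
 *
 *     AioContext *ctx = bdrv_get_aio_context(bs);
 *     aio_context_acquire(ctx);
 *     bdrv_drain(bs);
 *     // ... modify bs while no requests are in flight ...
 *     aio_context_release(ctx);
 */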
/* make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
 * Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
2080 */ 2081 if (bs_new->node_name[0] != '\0') { 2082 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list); 2083 } 2084 if (bs_old->node_name[0] != '\0') { 2085 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list); 2086 } 2087 2088 /* bs_new must be unattached and shouldn't have anything fancy enabled */ 2089 assert(!bs_new->blk); 2090 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps)); 2091 assert(bs_new->job == NULL); 2092 assert(bs_new->io_limits_enabled == false); 2093 assert(!throttle_have_timer(&bs_new->throttle_state)); 2094 2095 tmp = *bs_new; 2096 *bs_new = *bs_old; 2097 *bs_old = tmp; 2098 2099 /* there are some fields that should not be swapped, move them back */ 2100 bdrv_move_feature_fields(&tmp, bs_old); 2101 bdrv_move_feature_fields(bs_old, bs_new); 2102 bdrv_move_feature_fields(bs_new, &tmp); 2103 2104 /* bs_new must remain unattached */ 2105 assert(!bs_new->blk); 2106 2107 /* Check a few fields that should remain attached to the device */ 2108 assert(bs_new->job == NULL); 2109 assert(bs_new->io_limits_enabled == false); 2110 assert(!throttle_have_timer(&bs_new->throttle_state)); 2111 2112 /* insert the nodes back into the graph node list if needed */ 2113 if (bs_new->node_name[0] != '\0') { 2114 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list); 2115 } 2116 if (bs_old->node_name[0] != '\0') { 2117 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list); 2118 } 2119 2120 bdrv_rebind(bs_new); 2121 bdrv_rebind(bs_old); 2122 } 2123 2124 /* 2125 * Add new bs contents at the top of an image chain while the chain is 2126 * live, while keeping required fields on the top layer. 2127 * 2128 * This will modify the BlockDriverState fields, and swap contents 2129 * between bs_new and bs_top. Both bs_new and bs_top are modified. 2130 * 2131 * bs_new must not be attached to a BlockBackend. 2132 * 2133 * This function does not create any image files. 2134 */ 2135 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) 2136 { 2137 bdrv_swap(bs_new, bs_top); 2138 2139 /* The contents of 'tmp' will become bs_top, as we are 2140 * swapping bs_new and bs_top contents. */ 2141 bdrv_set_backing_hd(bs_top, bs_new); 2142 } 2143 2144 static void bdrv_delete(BlockDriverState *bs) 2145 { 2146 assert(!bs->job); 2147 assert(bdrv_op_blocker_is_empty(bs)); 2148 assert(!bs->refcnt); 2149 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 2150 2151 bdrv_close(bs); 2152 2153 /* remove from list, if necessary */ 2154 bdrv_make_anon(bs); 2155 2156 g_free(bs); 2157 } 2158 2159 /* 2160 * Run consistency checks on an image 2161 * 2162 * Returns 0 if the check could be completed (it doesn't mean that the image is 2163 * free of errors) or -errno when an internal error occurred. The results of the 2164 * check are stored in res. 
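 *
 * Minimal usage sketch (illustrative only, not called anywhere in this
 * file; it assumes a format driver such as qcow2 that implements the
 * bdrv_check callback):
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, BDRV_FIX_LEAKS | BDRV_FIX_ERRORS);
 *     if (ret == 0 && (result.corruptions || result.leaks)) {
 *         ... the check ran, but some problems could not be repaired ...
 *     }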
2165 */ 2166 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) 2167 { 2168 if (bs->drv == NULL) { 2169 return -ENOMEDIUM; 2170 } 2171 if (bs->drv->bdrv_check == NULL) { 2172 return -ENOTSUP; 2173 } 2174 2175 memset(res, 0, sizeof(*res)); 2176 return bs->drv->bdrv_check(bs, res, fix); 2177 } 2178 2179 #define COMMIT_BUF_SECTORS 2048 2180 2181 /* commit the COW image contents into its backing file */ 2182 int bdrv_commit(BlockDriverState *bs) 2183 { 2184 BlockDriver *drv = bs->drv; 2185 int64_t sector, total_sectors, length, backing_length; 2186 int n, ro, open_flags; 2187 int ret = 0; 2188 uint8_t *buf = NULL; 2189 char filename[PATH_MAX]; 2190 2191 if (!drv) 2192 return -ENOMEDIUM; 2193 2194 if (!bs->backing_hd) { 2195 return -ENOTSUP; 2196 } 2197 2198 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) || 2199 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) { 2200 return -EBUSY; 2201 } 2202 2203 ro = bs->backing_hd->read_only; 2204 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */ 2205 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename); 2206 open_flags = bs->backing_hd->open_flags; 2207 2208 if (ro) { 2209 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) { 2210 return -EACCES; 2211 } 2212 } 2213 2214 length = bdrv_getlength(bs); 2215 if (length < 0) { 2216 ret = length; 2217 goto ro_cleanup; 2218 } 2219 2220 backing_length = bdrv_getlength(bs->backing_hd); 2221 if (backing_length < 0) { 2222 ret = backing_length; 2223 goto ro_cleanup; 2224 } 2225 2226 /* If our top snapshot is larger than the backing file image, 2227 * grow the backing file image if possible. If not possible, 2228 * we must return an error */ 2229 if (length > backing_length) { 2230 ret = bdrv_truncate(bs->backing_hd, length); 2231 if (ret < 0) { 2232 goto ro_cleanup; 2233 } 2234 } 2235 2236 total_sectors = length >> BDRV_SECTOR_BITS; 2237 2238 /* qemu_try_blockalign() for bs will choose an alignment that works for 2239 * bs->backing_hd as well, so no need to compare the alignment manually. */ 2240 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); 2241 if (buf == NULL) { 2242 ret = -ENOMEM; 2243 goto ro_cleanup; 2244 } 2245 2246 for (sector = 0; sector < total_sectors; sector += n) { 2247 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n); 2248 if (ret < 0) { 2249 goto ro_cleanup; 2250 } 2251 if (ret) { 2252 ret = bdrv_read(bs, sector, buf, n); 2253 if (ret < 0) { 2254 goto ro_cleanup; 2255 } 2256 2257 ret = bdrv_write(bs->backing_hd, sector, buf, n); 2258 if (ret < 0) { 2259 goto ro_cleanup; 2260 } 2261 } 2262 } 2263 2264 if (drv->bdrv_make_empty) { 2265 ret = drv->bdrv_make_empty(bs); 2266 if (ret < 0) { 2267 goto ro_cleanup; 2268 } 2269 bdrv_flush(bs); 2270 } 2271 2272 /* 2273 * Make sure all data we wrote to the backing device is actually 2274 * stable on disk.
2275 */ 2276 if (bs->backing_hd) { 2277 bdrv_flush(bs->backing_hd); 2278 } 2279 2280 ret = 0; 2281 ro_cleanup: 2282 qemu_vfree(buf); 2283 2284 if (ro) { 2285 /* ignoring error return here */ 2286 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL); 2287 } 2288 2289 return ret; 2290 } 2291 2292 int bdrv_commit_all(void) 2293 { 2294 BlockDriverState *bs; 2295 2296 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 2297 AioContext *aio_context = bdrv_get_aio_context(bs); 2298 2299 aio_context_acquire(aio_context); 2300 if (bs->drv && bs->backing_hd) { 2301 int ret = bdrv_commit(bs); 2302 if (ret < 0) { 2303 aio_context_release(aio_context); 2304 return ret; 2305 } 2306 } 2307 aio_context_release(aio_context); 2308 } 2309 return 0; 2310 } 2311 2312 /** 2313 * Remove an active request from the tracked requests list 2314 * 2315 * This function should be called when a tracked request is completing. 2316 */ 2317 static void tracked_request_end(BdrvTrackedRequest *req) 2318 { 2319 if (req->serialising) { 2320 req->bs->serialising_in_flight--; 2321 } 2322 2323 QLIST_REMOVE(req, list); 2324 qemu_co_queue_restart_all(&req->wait_queue); 2325 } 2326 2327 /** 2328 * Add an active request to the tracked requests list 2329 */ 2330 static void tracked_request_begin(BdrvTrackedRequest *req, 2331 BlockDriverState *bs, 2332 int64_t offset, 2333 unsigned int bytes, bool is_write) 2334 { 2335 *req = (BdrvTrackedRequest){ 2336 .bs = bs, 2337 .offset = offset, 2338 .bytes = bytes, 2339 .is_write = is_write, 2340 .co = qemu_coroutine_self(), 2341 .serialising = false, 2342 .overlap_offset = offset, 2343 .overlap_bytes = bytes, 2344 }; 2345 2346 qemu_co_queue_init(&req->wait_queue); 2347 2348 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 2349 } 2350 2351 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 2352 { 2353 int64_t overlap_offset = req->offset & ~(align - 1); 2354 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 2355 - overlap_offset; 2356 2357 if (!req->serialising) { 2358 req->bs->serialising_in_flight++; 2359 req->serialising = true; 2360 } 2361 2362 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 2363 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 2364 } 2365 2366 /** 2367 * Round a region to cluster boundaries 2368 */ 2369 void bdrv_round_to_clusters(BlockDriverState *bs, 2370 int64_t sector_num, int nb_sectors, 2371 int64_t *cluster_sector_num, 2372 int *cluster_nb_sectors) 2373 { 2374 BlockDriverInfo bdi; 2375 2376 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 2377 *cluster_sector_num = sector_num; 2378 *cluster_nb_sectors = nb_sectors; 2379 } else { 2380 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 2381 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 2382 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 2383 nb_sectors, c); 2384 } 2385 } 2386 2387 static int bdrv_get_cluster_size(BlockDriverState *bs) 2388 { 2389 BlockDriverInfo bdi; 2390 int ret; 2391 2392 ret = bdrv_get_info(bs, &bdi); 2393 if (ret < 0 || bdi.cluster_size == 0) { 2394 return bs->request_alignment; 2395 } else { 2396 return bdi.cluster_size; 2397 } 2398 } 2399 2400 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 2401 int64_t offset, unsigned int bytes) 2402 { 2403 /* aaaa bbbb */ 2404 if (offset >= req->overlap_offset + req->overlap_bytes) { 2405 return false; 2406 } 2407 /* bbbb aaaa */ 2408 if (req->overlap_offset >= offset + bytes) { 2409 return false; 2410 } 2411 return true; 
2412 } 2413 2414 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 2415 { 2416 BlockDriverState *bs = self->bs; 2417 BdrvTrackedRequest *req; 2418 bool retry; 2419 bool waited = false; 2420 2421 if (!bs->serialising_in_flight) { 2422 return false; 2423 } 2424 2425 do { 2426 retry = false; 2427 QLIST_FOREACH(req, &bs->tracked_requests, list) { 2428 if (req == self || (!req->serialising && !self->serialising)) { 2429 continue; 2430 } 2431 if (tracked_request_overlaps(req, self->overlap_offset, 2432 self->overlap_bytes)) 2433 { 2434 /* Hitting this means there was a reentrant request, for 2435 * example, a block driver issuing nested requests. This must 2436 * never happen since it means deadlock. 2437 */ 2438 assert(qemu_coroutine_self() != req->co); 2439 2440 /* If the request is already (indirectly) waiting for us, or 2441 * will wait for us as soon as it wakes up, then just go on 2442 * (instead of producing a deadlock in the former case). */ 2443 if (!req->waiting_for) { 2444 self->waiting_for = req; 2445 qemu_co_queue_wait(&req->wait_queue); 2446 self->waiting_for = NULL; 2447 retry = true; 2448 waited = true; 2449 break; 2450 } 2451 } 2452 } 2453 } while (retry); 2454 2455 return waited; 2456 } 2457 2458 /* 2459 * Return values: 2460 * 0 - success 2461 * -EINVAL - backing format specified, but no file 2462 * -ENOSPC - can't update the backing file because no space is left in the 2463 * image file header 2464 * -ENOTSUP - format driver doesn't support changing the backing file 2465 */ 2466 int bdrv_change_backing_file(BlockDriverState *bs, 2467 const char *backing_file, const char *backing_fmt) 2468 { 2469 BlockDriver *drv = bs->drv; 2470 int ret; 2471 2472 /* Backing file format doesn't make sense without a backing file */ 2473 if (backing_fmt && !backing_file) { 2474 return -EINVAL; 2475 } 2476 2477 if (drv->bdrv_change_backing_file != NULL) { 2478 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt); 2479 } else { 2480 ret = -ENOTSUP; 2481 } 2482 2483 if (ret == 0) { 2484 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); 2485 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); 2486 } 2487 return ret; 2488 } 2489 2490 /* 2491 * Finds the image layer in the chain that has 'bs' as its backing file. 2492 * 2493 * active is the current topmost image. 2494 * 2495 * Returns NULL if bs is not found in active's image chain, 2496 * or if active == bs. 2497 * 2498 * Returns the bottommost base image if bs == NULL. 2499 */ 2500 BlockDriverState *bdrv_find_overlay(BlockDriverState *active, 2501 BlockDriverState *bs) 2502 { 2503 while (active && bs != active->backing_hd) { 2504 active = active->backing_hd; 2505 } 2506 2507 return active; 2508 } 2509 2510 /* Given a BDS, searches for the base layer. */ 2511 BlockDriverState *bdrv_find_base(BlockDriverState *bs) 2512 { 2513 return bdrv_find_overlay(bs, NULL); 2514 } 2515 2516 typedef struct BlkIntermediateStates { 2517 BlockDriverState *bs; 2518 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; 2519 } BlkIntermediateStates; 2520 2521 2522 /* 2523 * Drops images above 'base' up to and including 'top', and sets the image 2524 * above 'top' to have base as its backing file. 2525 * 2526 * Requires that the overlay to 'top' is opened r/w, so that the backing file 2527 * information in 'bs' can be properly updated. 
2528 * 2529 * E.g., this will convert the following chain: 2530 * bottom <- base <- intermediate <- top <- active 2531 * 2532 * to 2533 * 2534 * bottom <- base <- active 2535 * 2536 * It is allowed for bottom==base, in which case it converts: 2537 * 2538 * base <- intermediate <- top <- active 2539 * 2540 * to 2541 * 2542 * base <- active 2543 * 2544 * If backing_file_str is non-NULL, it will be used when modifying top's 2545 * overlay image metadata. 2546 * 2547 * Error conditions: 2548 * if active == top, that is considered an error 2549 * 2550 */ 2551 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, 2552 BlockDriverState *base, const char *backing_file_str) 2553 { 2554 BlockDriverState *intermediate; 2555 BlockDriverState *base_bs = NULL; 2556 BlockDriverState *new_top_bs = NULL; 2557 BlkIntermediateStates *intermediate_state, *next; 2558 int ret = -EIO; 2559 2560 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete; 2561 QSIMPLEQ_INIT(&states_to_delete); 2562 2563 if (!top->drv || !base->drv) { 2564 goto exit; 2565 } 2566 2567 new_top_bs = bdrv_find_overlay(active, top); 2568 2569 if (new_top_bs == NULL) { 2570 /* we could not find the image above 'top', this is an error */ 2571 goto exit; 2572 } 2573 2574 /* special case of new_top_bs->backing_hd already pointing to base - nothing 2575 * to do, no intermediate images */ 2576 if (new_top_bs->backing_hd == base) { 2577 ret = 0; 2578 goto exit; 2579 } 2580 2581 intermediate = top; 2582 2583 /* now we will go down through the list, and add each BDS we find 2584 * into our deletion queue, until we hit the 'base' 2585 */ 2586 while (intermediate) { 2587 intermediate_state = g_new0(BlkIntermediateStates, 1); 2588 intermediate_state->bs = intermediate; 2589 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry); 2590 2591 if (intermediate->backing_hd == base) { 2592 base_bs = intermediate->backing_hd; 2593 break; 2594 } 2595 intermediate = intermediate->backing_hd; 2596 } 2597 if (base_bs == NULL) { 2598 /* something went wrong, we did not end at the base. safely 2599 * unravel everything, and exit with error */ 2600 goto exit; 2601 } 2602 2603 /* success - we can delete the intermediate states, and link top->base */ 2604 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename; 2605 ret = bdrv_change_backing_file(new_top_bs, backing_file_str, 2606 base_bs->drv ? 
base_bs->drv->format_name : ""); 2607 if (ret) { 2608 goto exit; 2609 } 2610 bdrv_set_backing_hd(new_top_bs, base_bs); 2611 2612 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2613 /* so that bdrv_close() does not recursively close the chain */ 2614 bdrv_set_backing_hd(intermediate_state->bs, NULL); 2615 bdrv_unref(intermediate_state->bs); 2616 } 2617 ret = 0; 2618 2619 exit: 2620 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2621 g_free(intermediate_state); 2622 } 2623 return ret; 2624 } 2625 2626 2627 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 2628 size_t size) 2629 { 2630 int64_t len; 2631 2632 if (size > INT_MAX) { 2633 return -EIO; 2634 } 2635 2636 if (!bdrv_is_inserted(bs)) 2637 return -ENOMEDIUM; 2638 2639 if (bs->growable) 2640 return 0; 2641 2642 len = bdrv_getlength(bs); 2643 2644 if (offset < 0) 2645 return -EIO; 2646 2647 if ((offset > len) || (len - offset < size)) 2648 return -EIO; 2649 2650 return 0; 2651 } 2652 2653 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 2654 int nb_sectors) 2655 { 2656 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { 2657 return -EIO; 2658 } 2659 2660 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 2661 nb_sectors * BDRV_SECTOR_SIZE); 2662 } 2663 2664 typedef struct RwCo { 2665 BlockDriverState *bs; 2666 int64_t offset; 2667 QEMUIOVector *qiov; 2668 bool is_write; 2669 int ret; 2670 BdrvRequestFlags flags; 2671 } RwCo; 2672 2673 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 2674 { 2675 RwCo *rwco = opaque; 2676 2677 if (!rwco->is_write) { 2678 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 2679 rwco->qiov->size, rwco->qiov, 2680 rwco->flags); 2681 } else { 2682 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 2683 rwco->qiov->size, rwco->qiov, 2684 rwco->flags); 2685 } 2686 } 2687 2688 /* 2689 * Process a vectored synchronous request using coroutines 2690 */ 2691 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 2692 QEMUIOVector *qiov, bool is_write, 2693 BdrvRequestFlags flags) 2694 { 2695 Coroutine *co; 2696 RwCo rwco = { 2697 .bs = bs, 2698 .offset = offset, 2699 .qiov = qiov, 2700 .is_write = is_write, 2701 .ret = NOT_DONE, 2702 .flags = flags, 2703 }; 2704 2705 /** 2706 * In sync call context, when the vcpu is blocked, this throttling timer 2707 * will not fire; so the I/O throttling function has to be disabled here 2708 * if it has been enabled. 
2709 */ 2710 if (bs->io_limits_enabled) { 2711 fprintf(stderr, "Disabling I/O throttling on '%s' due " 2712 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 2713 bdrv_io_limits_disable(bs); 2714 } 2715 2716 if (qemu_in_coroutine()) { 2717 /* Fast-path if already in coroutine context */ 2718 bdrv_rw_co_entry(&rwco); 2719 } else { 2720 AioContext *aio_context = bdrv_get_aio_context(bs); 2721 2722 co = qemu_coroutine_create(bdrv_rw_co_entry); 2723 qemu_coroutine_enter(co, &rwco); 2724 while (rwco.ret == NOT_DONE) { 2725 aio_poll(aio_context, true); 2726 } 2727 } 2728 return rwco.ret; 2729 } 2730 2731 /* 2732 * Process a synchronous request using coroutines 2733 */ 2734 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 2735 int nb_sectors, bool is_write, BdrvRequestFlags flags) 2736 { 2737 QEMUIOVector qiov; 2738 struct iovec iov = { 2739 .iov_base = (void *)buf, 2740 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 2741 }; 2742 2743 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { 2744 return -EINVAL; 2745 } 2746 2747 qemu_iovec_init_external(&qiov, &iov, 1); 2748 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 2749 &qiov, is_write, flags); 2750 } 2751 2752 /* return < 0 if error. See bdrv_write() for the return codes */ 2753 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 2754 uint8_t *buf, int nb_sectors) 2755 { 2756 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 2757 } 2758 2759 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 2760 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 2761 uint8_t *buf, int nb_sectors) 2762 { 2763 bool enabled; 2764 int ret; 2765 2766 enabled = bs->io_limits_enabled; 2767 bs->io_limits_enabled = false; 2768 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 2769 bs->io_limits_enabled = enabled; 2770 return ret; 2771 } 2772 2773 /* Return < 0 if error. Important errors are: 2774 -EIO generic I/O error (may happen for all errors) 2775 -ENOMEDIUM No media inserted. 2776 -EINVAL Invalid sector number or nb_sectors 2777 -EACCES Trying to write a read-only device 2778 */ 2779 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 2780 const uint8_t *buf, int nb_sectors) 2781 { 2782 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 2783 } 2784 2785 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 2786 int nb_sectors, BdrvRequestFlags flags) 2787 { 2788 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 2789 BDRV_REQ_ZERO_WRITE | flags); 2790 } 2791 2792 /* 2793 * Completely zero out a block device with the help of bdrv_write_zeroes. 2794 * The operation is sped up by checking the block status and only writing 2795 * zeroes to the device if they currently do not return zeroes. Optional 2796 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 2797 * 2798 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
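 *
 * Illustrative sketch (not used elsewhere in this file): zero out a whole
 * device, letting the driver unmap sectors instead of writing explicit
 * zeroes where it can:
 *
 *     int ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         error_report("zeroing failed: %s", strerror(-ret));
 *     }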
2799 */ 2800 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 2801 { 2802 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 2803 int n; 2804 2805 target_sectors = bdrv_nb_sectors(bs); 2806 if (target_sectors < 0) { 2807 return target_sectors; 2808 } 2809 2810 for (;;) { 2811 nb_sectors = target_sectors - sector_num; 2812 if (nb_sectors <= 0) { 2813 return 0; 2814 } 2815 if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { 2816 nb_sectors = INT_MAX / BDRV_SECTOR_SIZE; 2817 } 2818 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 2819 if (ret < 0) { 2820 error_report("error getting block status at sector %" PRId64 ": %s", 2821 sector_num, strerror(-ret)); 2822 return ret; 2823 } 2824 if (ret & BDRV_BLOCK_ZERO) { 2825 sector_num += n; 2826 continue; 2827 } 2828 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 2829 if (ret < 0) { 2830 error_report("error writing zeroes at sector %" PRId64 ": %s", 2831 sector_num, strerror(-ret)); 2832 return ret; 2833 } 2834 sector_num += n; 2835 } 2836 } 2837 2838 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 2839 { 2840 QEMUIOVector qiov; 2841 struct iovec iov = { 2842 .iov_base = (void *)buf, 2843 .iov_len = bytes, 2844 }; 2845 int ret; 2846 2847 if (bytes < 0) { 2848 return -EINVAL; 2849 } 2850 2851 qemu_iovec_init_external(&qiov, &iov, 1); 2852 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 2853 if (ret < 0) { 2854 return ret; 2855 } 2856 2857 return bytes; 2858 } 2859 2860 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 2861 { 2862 int ret; 2863 2864 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 2865 if (ret < 0) { 2866 return ret; 2867 } 2868 2869 return qiov->size; 2870 } 2871 2872 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 2873 const void *buf, int bytes) 2874 { 2875 QEMUIOVector qiov; 2876 struct iovec iov = { 2877 .iov_base = (void *) buf, 2878 .iov_len = bytes, 2879 }; 2880 2881 if (bytes < 0) { 2882 return -EINVAL; 2883 } 2884 2885 qemu_iovec_init_external(&qiov, &iov, 1); 2886 return bdrv_pwritev(bs, offset, &qiov); 2887 } 2888 2889 /* 2890 * Writes to the file and ensures that no writes are reordered across this 2891 * request (acts as a barrier) 2892 * 2893 * Returns 0 on success, -errno in error cases. 2894 */ 2895 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 2896 const void *buf, int count) 2897 { 2898 int ret; 2899 2900 ret = bdrv_pwrite(bs, offset, buf, count); 2901 if (ret < 0) { 2902 return ret; 2903 } 2904 2905 /* No flush needed for cache modes that already do it */ 2906 if (bs->enable_write_cache) { 2907 bdrv_flush(bs); 2908 } 2909 2910 return 0; 2911 } 2912 2913 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 2914 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 2915 { 2916 /* Perform I/O through a temporary buffer so that users who scribble over 2917 * their read buffer while the operation is in progress do not end up 2918 * modifying the image file. This is critical for zero-copy guest I/O 2919 * where anything might happen inside guest memory. 2920 */ 2921 void *bounce_buffer; 2922 2923 BlockDriver *drv = bs->drv; 2924 struct iovec iov; 2925 QEMUIOVector bounce_qiov; 2926 int64_t cluster_sector_num; 2927 int cluster_nb_sectors; 2928 size_t skip_bytes; 2929 int ret; 2930 2931 /* Cover entire cluster so no additional backing file I/O is required when 2932 * allocating cluster in the image file. 
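 *
 * For example, with 64 KiB clusters (128 sectors of 512 bytes), a guest
 * read of sectors [130, 140) is widened below to the cluster-aligned
 * range [128, 256) before being submitted to the driver.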
2933 */ 2934 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 2935 &cluster_sector_num, &cluster_nb_sectors); 2936 2937 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 2938 cluster_sector_num, cluster_nb_sectors); 2939 2940 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 2941 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 2942 if (bounce_buffer == NULL) { 2943 ret = -ENOMEM; 2944 goto err; 2945 } 2946 2947 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 2948 2949 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 2950 &bounce_qiov); 2951 if (ret < 0) { 2952 goto err; 2953 } 2954 2955 if (drv->bdrv_co_write_zeroes && 2956 buffer_is_zero(bounce_buffer, iov.iov_len)) { 2957 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 2958 cluster_nb_sectors, 0); 2959 } else { 2960 /* This does not change the data on the disk, it is not necessary 2961 * to flush even in cache=writethrough mode. 2962 */ 2963 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 2964 &bounce_qiov); 2965 } 2966 2967 if (ret < 0) { 2968 /* It might be okay to ignore write errors for guest requests. If this 2969 * is a deliberate copy-on-read then we don't want to ignore the error. 2970 * Simply report it in all cases. 2971 */ 2972 goto err; 2973 } 2974 2975 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 2976 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 2977 nb_sectors * BDRV_SECTOR_SIZE); 2978 2979 err: 2980 qemu_vfree(bounce_buffer); 2981 return ret; 2982 } 2983 2984 /* 2985 * Forwards an already correctly aligned request to the BlockDriver. This 2986 * handles copy on read and zeroing after EOF; any other features must be 2987 * implemented by the caller. 2988 */ 2989 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 2990 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 2991 int64_t align, QEMUIOVector *qiov, int flags) 2992 { 2993 BlockDriver *drv = bs->drv; 2994 int ret; 2995 2996 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 2997 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 2998 2999 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 3000 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 3001 assert(!qiov || bytes == qiov->size); 3002 3003 /* Handle Copy on Read and associated serialisation */ 3004 if (flags & BDRV_REQ_COPY_ON_READ) { 3005 /* If we touch the same cluster it counts as an overlap. This 3006 * guarantees that allocating writes will be serialized and not race 3007 * with each other for the same cluster. For example, in copy-on-read 3008 * it ensures that the CoR read and write operations are atomic and 3009 * guest writes cannot interleave between them. 
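 *
 * Concretely: without cluster-granularity serialisation, a guest write
 * to sector 130 could land between the CoR read of cluster [128, 256)
 * and the write-back of the (by then stale) bounce buffer, silently
 * undoing the guest write.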
*/ 3010 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 3011 } 3012 3013 wait_serialising_requests(req); 3014 3015 if (flags & BDRV_REQ_COPY_ON_READ) { 3016 int pnum; 3017 3018 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 3019 if (ret < 0) { 3020 goto out; 3021 } 3022 3023 if (!ret || pnum != nb_sectors) { 3024 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 3025 goto out; 3026 } 3027 } 3028 3029 /* Forward the request to the BlockDriver */ 3030 if (!(bs->zero_beyond_eof && bs->growable)) { 3031 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 3032 } else { 3033 /* Read zeroes after EOF of growable BDSes */ 3034 int64_t total_sectors, max_nb_sectors; 3035 3036 total_sectors = bdrv_nb_sectors(bs); 3037 if (total_sectors < 0) { 3038 ret = total_sectors; 3039 goto out; 3040 } 3041 3042 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 3043 align >> BDRV_SECTOR_BITS); 3044 if (max_nb_sectors > 0) { 3045 QEMUIOVector local_qiov; 3046 size_t local_sectors; 3047 3048 /* clamp so the byte count below still fits in a size_t */ max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE); 3049 local_sectors = MIN(max_nb_sectors, nb_sectors); 3050 3051 qemu_iovec_init(&local_qiov, qiov->niov); 3052 qemu_iovec_concat(&local_qiov, qiov, 0, 3053 local_sectors * BDRV_SECTOR_SIZE); 3054 3055 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors, 3056 &local_qiov); 3057 3058 qemu_iovec_destroy(&local_qiov); 3059 } else { 3060 ret = 0; 3061 } 3062 3063 /* Reading beyond end of file is supposed to produce zeroes */ 3064 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 3065 uint64_t offset = MAX(0, total_sectors - sector_num); 3066 uint64_t bytes = (sector_num + nb_sectors - offset) * 3067 BDRV_SECTOR_SIZE; 3068 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 3069 } 3070 } 3071 3072 out: 3073 return ret; 3074 } 3075 3076 /* 3077 * Handle a read request in coroutine context 3078 */ 3079 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 3080 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 3081 BdrvRequestFlags flags) 3082 { 3083 BlockDriver *drv = bs->drv; 3084 BdrvTrackedRequest req; 3085 3086 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 3087 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 3088 uint8_t *head_buf = NULL; 3089 uint8_t *tail_buf = NULL; 3090 QEMUIOVector local_qiov; 3091 bool use_local_qiov = false; 3092 int ret; 3093 3094 if (!drv) { 3095 return -ENOMEDIUM; 3096 } 3097 if (bdrv_check_byte_request(bs, offset, bytes)) { 3098 return -EIO; 3099 } 3100 3101 if (bs->copy_on_read) { 3102 flags |= BDRV_REQ_COPY_ON_READ; 3103 } 3104 3105 /* throttling disk I/O */ 3106 if (bs->io_limits_enabled) { 3107 bdrv_io_limits_intercept(bs, bytes, false); 3108 } 3109 3110 /* Align read if necessary by padding qiov */ 3111 if (offset & (align - 1)) { 3112 head_buf = qemu_blockalign(bs, align); 3113 qemu_iovec_init(&local_qiov, qiov->niov + 2); 3114 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 3115 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3116 use_local_qiov = true; 3117 3118 bytes += offset & (align - 1); 3119 offset = offset & ~(align - 1); 3120 } 3121 3122 if ((offset + bytes) & (align - 1)) { 3123 if (!use_local_qiov) { 3124 qemu_iovec_init(&local_qiov, qiov->niov + 1); 3125 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3126 use_local_qiov = true; 3127 } 3128 tail_buf = qemu_blockalign(bs, align); 3129 qemu_iovec_add(&local_qiov, tail_buf, 3130 align - ((offset + bytes) & (align -
1))); 3131 3132 bytes = ROUND_UP(bytes, align); 3133 } 3134 3135 tracked_request_begin(&req, bs, offset, bytes, false); 3136 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 3137 use_local_qiov ? &local_qiov : qiov, 3138 flags); 3139 tracked_request_end(&req); 3140 3141 if (use_local_qiov) { 3142 qemu_iovec_destroy(&local_qiov); 3143 qemu_vfree(head_buf); 3144 qemu_vfree(tail_buf); 3145 } 3146 3147 return ret; 3148 } 3149 3150 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 3151 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 3152 BdrvRequestFlags flags) 3153 { 3154 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { 3155 return -EINVAL; 3156 } 3157 3158 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 3159 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 3160 } 3161 3162 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 3163 int nb_sectors, QEMUIOVector *qiov) 3164 { 3165 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 3166 3167 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 3168 } 3169 3170 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 3171 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 3172 { 3173 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 3174 3175 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 3176 BDRV_REQ_COPY_ON_READ); 3177 } 3178 3179 /* if no limit is specified in the BlockLimits use a default 3180 * of 32768 512-byte sectors (16 MiB) per request. 3181 */ 3182 #define MAX_WRITE_ZEROES_DEFAULT 32768 3183 3184 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 3185 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 3186 { 3187 BlockDriver *drv = bs->drv; 3188 QEMUIOVector qiov; 3189 struct iovec iov = {0}; 3190 int ret = 0; 3191 3192 int max_write_zeroes = bs->bl.max_write_zeroes ? 3193 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; 3194 3195 while (nb_sectors > 0 && !ret) { 3196 int num = nb_sectors; 3197 3198 /* Align request. Block drivers can expect the "bulk" of the request 3199 * to be aligned. 3200 */ 3201 if (bs->bl.write_zeroes_alignment 3202 && num > bs->bl.write_zeroes_alignment) { 3203 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 3204 /* Make a small request up to the first aligned sector. */ 3205 num = bs->bl.write_zeroes_alignment; 3206 num -= sector_num % bs->bl.write_zeroes_alignment; 3207 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 3208 /* Shorten the request to the last aligned sector. num cannot 3209 * underflow because num > bs->bl.write_zeroes_alignment. 
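 *
 * For example, with write_zeroes_alignment == 8, a request covering
 * sectors [5, 29) is issued as three pieces: the head [5, 8), the
 * aligned bulk [8, 24), and finally the sub-alignment tail [24, 29).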
3210 */ 3211 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 3212 } 3213 } 3214 3215 /* limit request size */ 3216 if (num > max_write_zeroes) { 3217 num = max_write_zeroes; 3218 } 3219 3220 ret = -ENOTSUP; 3221 /* First try the efficient write zeroes operation */ 3222 if (drv->bdrv_co_write_zeroes) { 3223 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 3224 } 3225 3226 if (ret == -ENOTSUP) { 3227 /* Fall back to bounce buffer if write zeroes is unsupported */ 3228 iov.iov_len = num * BDRV_SECTOR_SIZE; 3229 if (iov.iov_base == NULL) { 3230 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); 3231 if (iov.iov_base == NULL) { 3232 ret = -ENOMEM; 3233 goto fail; 3234 } 3235 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 3236 } 3237 qemu_iovec_init_external(&qiov, &iov, 1); 3238 3239 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 3240 3241 /* Keep bounce buffer around if it is big enough for 3242 * all future requests. 3243 */ 3244 if (num < max_write_zeroes) { 3245 qemu_vfree(iov.iov_base); 3246 iov.iov_base = NULL; 3247 } 3248 } 3249 3250 sector_num += num; 3251 nb_sectors -= num; 3252 } 3253 3254 fail: 3255 qemu_vfree(iov.iov_base); 3256 return ret; 3257 } 3258 3259 /* 3260 * Forwards an already correctly aligned write request to the BlockDriver. 3261 */ 3262 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 3263 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 3264 QEMUIOVector *qiov, int flags) 3265 { 3266 BlockDriver *drv = bs->drv; 3267 bool waited; 3268 int ret; 3269 3270 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 3271 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 3272 3273 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 3274 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 3275 assert(!qiov || bytes == qiov->size); 3276 3277 waited = wait_serialising_requests(req); 3278 assert(!waited || !req->serialising); 3279 assert(req->overlap_offset <= offset); 3280 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 3281 3282 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 3283 3284 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 3285 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 3286 qemu_iovec_is_zero(qiov)) { 3287 flags |= BDRV_REQ_ZERO_WRITE; 3288 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 3289 flags |= BDRV_REQ_MAY_UNMAP; 3290 } 3291 } 3292 3293 if (ret < 0) { 3294 /* Do nothing, write notifier decided to fail this request */ 3295 } else if (flags & BDRV_REQ_ZERO_WRITE) { 3296 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); 3297 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 3298 } else { 3299 BLKDBG_EVENT(bs, BLKDBG_PWRITEV); 3300 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 3301 } 3302 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); 3303 3304 if (ret == 0 && !bs->enable_write_cache) { 3305 ret = bdrv_co_flush(bs); 3306 } 3307 3308 bdrv_set_dirty(bs, sector_num, nb_sectors); 3309 3310 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); 3311 3312 if (bs->growable && ret >= 0) { 3313 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 3314 } 3315 3316 return ret; 3317 } 3318 3319 /* 3320 * Handle a write request in coroutine context 3321 */ 3322 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 3323 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 3324 BdrvRequestFlags flags) 3325 { 3326 BdrvTrackedRequest req; 3327 /* TODO Lift
BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 3328 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 3329 uint8_t *head_buf = NULL; 3330 uint8_t *tail_buf = NULL; 3331 QEMUIOVector local_qiov; 3332 bool use_local_qiov = false; 3333 int ret; 3334 3335 if (!bs->drv) { 3336 return -ENOMEDIUM; 3337 } 3338 if (bs->read_only) { 3339 return -EACCES; 3340 } 3341 if (bdrv_check_byte_request(bs, offset, bytes)) { 3342 return -EIO; 3343 } 3344 3345 /* throttling disk I/O */ 3346 if (bs->io_limits_enabled) { 3347 bdrv_io_limits_intercept(bs, bytes, true); 3348 } 3349 3350 /* 3351 * Align write if necessary by performing a read-modify-write cycle. 3352 * Pad qiov with the read parts and be sure to have a tracked request not 3353 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 3354 */ 3355 tracked_request_begin(&req, bs, offset, bytes, true); 3356 3357 if (offset & (align - 1)) { 3358 QEMUIOVector head_qiov; 3359 struct iovec head_iov; 3360 3361 mark_request_serialising(&req, align); 3362 wait_serialising_requests(&req); 3363 3364 head_buf = qemu_blockalign(bs, align); 3365 head_iov = (struct iovec) { 3366 .iov_base = head_buf, 3367 .iov_len = align, 3368 }; 3369 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 3370 3371 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 3372 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 3373 align, &head_qiov, 0); 3374 if (ret < 0) { 3375 goto fail; 3376 } 3377 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 3378 3379 qemu_iovec_init(&local_qiov, qiov->niov + 2); 3380 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 3381 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3382 use_local_qiov = true; 3383 3384 bytes += offset & (align - 1); 3385 offset = offset & ~(align - 1); 3386 } 3387 3388 if ((offset + bytes) & (align - 1)) { 3389 QEMUIOVector tail_qiov; 3390 struct iovec tail_iov; 3391 size_t tail_bytes; 3392 bool waited; 3393 3394 mark_request_serialising(&req, align); 3395 waited = wait_serialising_requests(&req); 3396 assert(!waited || !use_local_qiov); 3397 3398 tail_buf = qemu_blockalign(bs, align); 3399 tail_iov = (struct iovec) { 3400 .iov_base = tail_buf, 3401 .iov_len = align, 3402 }; 3403 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 3404 3405 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 3406 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 3407 align, &tail_qiov, 0); 3408 if (ret < 0) { 3409 goto fail; 3410 } 3411 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 3412 3413 if (!use_local_qiov) { 3414 qemu_iovec_init(&local_qiov, qiov->niov + 1); 3415 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3416 use_local_qiov = true; 3417 } 3418 3419 tail_bytes = (offset + bytes) & (align - 1); 3420 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 3421 3422 bytes = ROUND_UP(bytes, align); 3423 } 3424 3425 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 3426 use_local_qiov ? 
&local_qiov : qiov, 3427 flags); 3428 3429 fail: 3430 tracked_request_end(&req); 3431 3432 if (use_local_qiov) { 3433 qemu_iovec_destroy(&local_qiov); 3434 } 3435 qemu_vfree(head_buf); 3436 qemu_vfree(tail_buf); 3437 3438 return ret; 3439 } 3440 3441 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 3442 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 3443 BdrvRequestFlags flags) 3444 { 3445 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { 3446 return -EINVAL; 3447 } 3448 3449 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 3450 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 3451 } 3452 3453 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 3454 int nb_sectors, QEMUIOVector *qiov) 3455 { 3456 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 3457 3458 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 3459 } 3460 3461 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 3462 int64_t sector_num, int nb_sectors, 3463 BdrvRequestFlags flags) 3464 { 3465 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 3466 3467 if (!(bs->open_flags & BDRV_O_UNMAP)) { 3468 flags &= ~BDRV_REQ_MAY_UNMAP; 3469 } 3470 3471 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 3472 BDRV_REQ_ZERO_WRITE | flags); 3473 } 3474 3475 /** 3476 * Truncate file to 'offset' bytes (needed only for file protocols) 3477 */ 3478 int bdrv_truncate(BlockDriverState *bs, int64_t offset) 3479 { 3480 BlockDriver *drv = bs->drv; 3481 int ret; 3482 if (!drv) 3483 return -ENOMEDIUM; 3484 if (!drv->bdrv_truncate) 3485 return -ENOTSUP; 3486 if (bs->read_only) 3487 return -EACCES; 3488 3489 ret = drv->bdrv_truncate(bs, offset); 3490 if (ret == 0) { 3491 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3492 if (bs->blk) { 3493 blk_dev_resize_cb(bs->blk); 3494 } 3495 } 3496 return ret; 3497 } 3498 3499 /** 3500 * Length of an allocated file in bytes. Sparse files are counted by actual 3501 * allocated space. Return < 0 if error or unknown. 3502 */ 3503 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) 3504 { 3505 BlockDriver *drv = bs->drv; 3506 if (!drv) { 3507 return -ENOMEDIUM; 3508 } 3509 if (drv->bdrv_get_allocated_file_size) { 3510 return drv->bdrv_get_allocated_file_size(bs); 3511 } 3512 if (bs->file) { 3513 return bdrv_get_allocated_file_size(bs->file); 3514 } 3515 return -ENOTSUP; 3516 } 3517 3518 /** 3519 * Return number of sectors on success, -errno on error. 3520 */ 3521 int64_t bdrv_nb_sectors(BlockDriverState *bs) 3522 { 3523 BlockDriver *drv = bs->drv; 3524 3525 if (!drv) 3526 return -ENOMEDIUM; 3527 3528 if (drv->has_variable_length) { 3529 int ret = refresh_total_sectors(bs, bs->total_sectors); 3530 if (ret < 0) { 3531 return ret; 3532 } 3533 } 3534 return bs->total_sectors; 3535 } 3536 3537 /** 3538 * Return length in bytes on success, -errno on error. 3539 * The length is always a multiple of BDRV_SECTOR_SIZE. 3540 */ 3541 int64_t bdrv_getlength(BlockDriverState *bs) 3542 { 3543 int64_t ret = bdrv_nb_sectors(bs); 3544 3545 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE; 3546 } 3547 3548 /* return 0 as number of sectors if no device present or error */ 3549 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) 3550 { 3551 int64_t nb_sectors = bdrv_nb_sectors(bs); 3552 3553 *nb_sectors_ptr = nb_sectors < 0 ?
0 : nb_sectors; 3554 } 3555 3556 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, 3557 BlockdevOnError on_write_error) 3558 { 3559 bs->on_read_error = on_read_error; 3560 bs->on_write_error = on_write_error; 3561 } 3562 3563 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) 3564 { 3565 return is_read ? bs->on_read_error : bs->on_write_error; 3566 } 3567 3568 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) 3569 { 3570 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; 3571 3572 switch (on_err) { 3573 case BLOCKDEV_ON_ERROR_ENOSPC: 3574 return (error == ENOSPC) ? 3575 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; 3576 case BLOCKDEV_ON_ERROR_STOP: 3577 return BLOCK_ERROR_ACTION_STOP; 3578 case BLOCKDEV_ON_ERROR_REPORT: 3579 return BLOCK_ERROR_ACTION_REPORT; 3580 case BLOCKDEV_ON_ERROR_IGNORE: 3581 return BLOCK_ERROR_ACTION_IGNORE; 3582 default: 3583 abort(); 3584 } 3585 } 3586 3587 static void send_qmp_error_event(BlockDriverState *bs, 3588 BlockErrorAction action, 3589 bool is_read, int error) 3590 { 3591 IoOperationType optype; 3592 3593 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; 3594 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, 3595 bdrv_iostatus_is_enabled(bs), 3596 error == ENOSPC, strerror(error), 3597 &error_abort); 3598 } 3599 3600 /* This is done by device models because, while the block layer knows 3601 * about the error, it does not know whether an operation comes from 3602 * the device or the block layer (from a job, for example). 3603 */ 3604 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, 3605 bool is_read, int error) 3606 { 3607 assert(error >= 0); 3608 3609 if (action == BLOCK_ERROR_ACTION_STOP) { 3610 /* First set the iostatus, so that "info block" returns an iostatus 3611 * that matches the events raised so far (an additional error iostatus 3612 * is fine, but not a lost one). 3613 */ 3614 bdrv_iostatus_set_err(bs, error); 3615 3616 /* Then raise the request to stop the VM and the event. 3617 * qemu_system_vmstop_request_prepare has two effects. First, 3618 * it ensures that the STOP event always comes after the 3619 * BLOCK_IO_ERROR event. Second, it ensures that even if management 3620 * can observe the STOP event and do a "cont" before the STOP 3621 * event is issued, the VM will not stop. In this case, vm_start() 3622 * also ensures that the STOP/RESUME pair of events is emitted. 
3623 */ 3624 qemu_system_vmstop_request_prepare(); 3625 send_qmp_error_event(bs, action, is_read, error); 3626 qemu_system_vmstop_request(RUN_STATE_IO_ERROR); 3627 } else { 3628 send_qmp_error_event(bs, action, is_read, error); 3629 } 3630 } 3631 3632 int bdrv_is_read_only(BlockDriverState *bs) 3633 { 3634 return bs->read_only; 3635 } 3636 3637 int bdrv_is_sg(BlockDriverState *bs) 3638 { 3639 return bs->sg; 3640 } 3641 3642 int bdrv_enable_write_cache(BlockDriverState *bs) 3643 { 3644 return bs->enable_write_cache; 3645 } 3646 3647 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) 3648 { 3649 bs->enable_write_cache = wce; 3650 3651 /* so a reopen() will preserve wce */ 3652 if (wce) { 3653 bs->open_flags |= BDRV_O_CACHE_WB; 3654 } else { 3655 bs->open_flags &= ~BDRV_O_CACHE_WB; 3656 } 3657 } 3658 3659 int bdrv_is_encrypted(BlockDriverState *bs) 3660 { 3661 if (bs->backing_hd && bs->backing_hd->encrypted) 3662 return 1; 3663 return bs->encrypted; 3664 } 3665 3666 int bdrv_key_required(BlockDriverState *bs) 3667 { 3668 BlockDriverState *backing_hd = bs->backing_hd; 3669 3670 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) 3671 return 1; 3672 return (bs->encrypted && !bs->valid_key); 3673 } 3674 3675 int bdrv_set_key(BlockDriverState *bs, const char *key) 3676 { 3677 int ret; 3678 if (bs->backing_hd && bs->backing_hd->encrypted) { 3679 ret = bdrv_set_key(bs->backing_hd, key); 3680 if (ret < 0) 3681 return ret; 3682 if (!bs->encrypted) 3683 return 0; 3684 } 3685 if (!bs->encrypted) { 3686 return -EINVAL; 3687 } else if (!bs->drv || !bs->drv->bdrv_set_key) { 3688 return -ENOMEDIUM; 3689 } 3690 ret = bs->drv->bdrv_set_key(bs, key); 3691 if (ret < 0) { 3692 bs->valid_key = 0; 3693 } else if (!bs->valid_key) { 3694 bs->valid_key = 1; 3695 if (bs->blk) { 3696 /* call the change callback now, we skipped it on open */ 3697 blk_dev_change_media_cb(bs->blk, true); 3698 } 3699 } 3700 return ret; 3701 } 3702 3703 const char *bdrv_get_format_name(BlockDriverState *bs) 3704 { 3705 return bs->drv ? bs->drv->format_name : NULL; 3706 } 3707 3708 static int qsort_strcmp(const void *a, const void *b) 3709 { 3710 /* qsort passes pointers to the array elements, i.e. to const char * */ return strcmp(*(const char *const *)a, *(const char *const *)b); 3711 } 3712 3713 void bdrv_iterate_format(void (*it)(void *opaque, const char *name), 3714 void *opaque) 3715 { 3716 BlockDriver *drv; 3717 int count = 0; 3718 int i; 3719 const char **formats = NULL; 3720 3721 QLIST_FOREACH(drv, &bdrv_drivers, list) { 3722 if (drv->format_name) { 3723 bool found = false; 3724 int i = count; 3725 while (formats && i && !found) { 3726 found = !strcmp(formats[--i], drv->format_name); 3727 } 3728 3729 if (!found) { 3730 formats = g_renew(const char *, formats, count + 1); 3731 formats[count++] = drv->format_name; 3732 } 3733 } 3734 } 3735 3736 qsort(formats, count, sizeof(formats[0]), qsort_strcmp); 3737 3738 for (i = 0; i < count; i++) { 3739 it(opaque, formats[i]); 3740 } 3741 3742 g_free(formats); 3743 } 3744 3745 /* Find the BlockDriverState owned by the BlockBackend with the given name */ 3746 /* TODO convert callers to blk_by_name(), then remove */ 3747 BlockDriverState *bdrv_find(const char *name) 3748 { 3749 BlockBackend *blk = blk_by_name(name); 3750 3751 return blk ?
blk_bs(blk) : NULL; 3752 } 3753 3754 /* Find a named node in the BDS graph */ 3755 BlockDriverState *bdrv_find_node(const char *node_name) 3756 { 3757 BlockDriverState *bs; 3758 3759 assert(node_name); 3760 3761 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { 3762 if (!strcmp(node_name, bs->node_name)) { 3763 return bs; 3764 } 3765 } 3766 return NULL; 3767 } 3768 3769 /* Put this QMP function here so it can access the static graph_bdrv_states. */ 3770 BlockDeviceInfoList *bdrv_named_nodes_list(void) 3771 { 3772 BlockDeviceInfoList *list, *entry; 3773 BlockDriverState *bs; 3774 3775 list = NULL; 3776 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { 3777 entry = g_malloc0(sizeof(*entry)); 3778 entry->value = bdrv_block_device_info(bs); 3779 entry->next = list; 3780 list = entry; 3781 } 3782 3783 return list; 3784 } 3785 3786 BlockDriverState *bdrv_lookup_bs(const char *device, 3787 const char *node_name, 3788 Error **errp) 3789 { 3790 BlockBackend *blk; 3791 BlockDriverState *bs; 3792 3793 if (device) { 3794 blk = blk_by_name(device); 3795 3796 if (blk) { 3797 return blk_bs(blk); 3798 } 3799 } 3800 3801 if (node_name) { 3802 bs = bdrv_find_node(node_name); 3803 3804 if (bs) { 3805 return bs; 3806 } 3807 } 3808 3809 error_setg(errp, "Cannot find device=%s nor node_name=%s", 3810 device ? device : "", 3811 node_name ? node_name : ""); 3812 return NULL; 3813 } 3814 3815 /* If 'base' is in the same chain as 'top', return true. Otherwise, 3816 * return false. If either argument is NULL, return false. */ 3817 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) 3818 { 3819 while (top && top != base) { 3820 top = top->backing_hd; 3821 } 3822 3823 return top != NULL; 3824 } 3825 3826 BlockDriverState *bdrv_next_node(BlockDriverState *bs) 3827 { 3828 if (!bs) { 3829 return QTAILQ_FIRST(&graph_bdrv_states); 3830 } 3831 return QTAILQ_NEXT(bs, node_list); 3832 } 3833 3834 BlockDriverState *bdrv_next(BlockDriverState *bs) 3835 { 3836 if (!bs) { 3837 return QTAILQ_FIRST(&bdrv_states); 3838 } 3839 return QTAILQ_NEXT(bs, device_list); 3840 } 3841 3842 const char *bdrv_get_node_name(const BlockDriverState *bs) 3843 { 3844 return bs->node_name; 3845 } 3846 3847 /* TODO check what callers really want: bs->node_name or blk_name() */ 3848 const char *bdrv_get_device_name(const BlockDriverState *bs) 3849 { 3850 return bs->blk ? blk_name(bs->blk) : ""; 3851 } 3852 3853 int bdrv_get_flags(BlockDriverState *bs) 3854 { 3855 return bs->open_flags; 3856 } 3857 3858 int bdrv_flush_all(void) 3859 { 3860 BlockDriverState *bs; 3861 int result = 0; 3862 3863 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 3864 AioContext *aio_context = bdrv_get_aio_context(bs); 3865 int ret; 3866 3867 aio_context_acquire(aio_context); 3868 ret = bdrv_flush(bs); 3869 if (ret < 0 && !result) { 3870 result = ret; 3871 } 3872 aio_context_release(aio_context); 3873 } 3874 3875 return result; 3876 } 3877 3878 int bdrv_has_zero_init_1(BlockDriverState *bs) 3879 { 3880 return 1; 3881 } 3882 3883 int bdrv_has_zero_init(BlockDriverState *bs) 3884 { 3885 assert(bs->drv); 3886 3887 /* If BS is a copy-on-write image, it is initialized to 3888 * the contents of the base image, which may not be zeroes.
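 *
 * For example, a qcow2 image with a backing file reports 0 here, while
 * an image without one defers to its driver's bdrv_has_zero_init
 * callback below (raw-posix, for instance, returns 1 for regular
 * files).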
*/ 3889 if (bs->backing_hd) { 3890 return 0; 3891 } 3892 if (bs->drv->bdrv_has_zero_init) { 3893 return bs->drv->bdrv_has_zero_init(bs); 3894 } 3895 3896 /* safe default */ 3897 return 0; 3898 } 3899 3900 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) 3901 { 3902 BlockDriverInfo bdi; 3903 3904 if (bs->backing_hd) { 3905 return false; 3906 } 3907 3908 if (bdrv_get_info(bs, &bdi) == 0) { 3909 return bdi.unallocated_blocks_are_zero; 3910 } 3911 3912 return false; 3913 } 3914 3915 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) 3916 { 3917 BlockDriverInfo bdi; 3918 3919 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { 3920 return false; 3921 } 3922 3923 if (bdrv_get_info(bs, &bdi) == 0) { 3924 return bdi.can_write_zeroes_with_unmap; 3925 } 3926 3927 return false; 3928 } 3929 3930 typedef struct BdrvCoGetBlockStatusData { 3931 BlockDriverState *bs; 3932 BlockDriverState *base; 3933 int64_t sector_num; 3934 int nb_sectors; 3935 int *pnum; 3936 int64_t ret; 3937 bool done; 3938 } BdrvCoGetBlockStatusData; 3939 3940 /* 3941 * Returns the allocation status of the specified sectors. 3942 * Drivers not implementing the functionality are assumed to not support 3943 * backing files, hence all their sectors are reported as allocated. 3944 * 3945 * If 'sector_num' is beyond the end of the disk image the return value is 0 3946 * and 'pnum' is set to 0. 3947 * 3948 * 'pnum' is set to the number of sectors (including and immediately following 3949 * the specified sector) that are known to be in the same 3950 * allocated/unallocated state. 3951 * 3952 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 3953 * beyond the end of the disk image it will be clamped. 3954 */ 3955 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 3956 int64_t sector_num, 3957 int nb_sectors, int *pnum) 3958 { 3959 int64_t total_sectors; 3960 int64_t n; 3961 int64_t ret, ret2; 3962 3963 total_sectors = bdrv_nb_sectors(bs); 3964 if (total_sectors < 0) { 3965 return total_sectors; 3966 } 3967 3968 if (sector_num >= total_sectors) { 3969 *pnum = 0; 3970 return 0; 3971 } 3972 3973 n = total_sectors - sector_num; 3974 if (n < nb_sectors) { 3975 nb_sectors = n; 3976 } 3977 3978 if (!bs->drv->bdrv_co_get_block_status) { 3979 *pnum = nb_sectors; 3980 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 3981 if (bs->drv->protocol_name) { 3982 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 3983 } 3984 return ret; 3985 } 3986 3987 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 3988 if (ret < 0) { 3989 *pnum = 0; 3990 return ret; 3991 } 3992 3993 if (ret & BDRV_BLOCK_RAW) { 3994 assert(ret & BDRV_BLOCK_OFFSET_VALID); 3995 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 3996 *pnum, pnum); 3997 } 3998 3999 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 4000 ret |= BDRV_BLOCK_ALLOCATED; 4001 } 4002 4003 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { 4004 if (bdrv_unallocated_blocks_are_zero(bs)) { 4005 ret |= BDRV_BLOCK_ZERO; 4006 } else if (bs->backing_hd) { 4007 BlockDriverState *bs2 = bs->backing_hd; 4008 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 4009 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 4010 ret |= BDRV_BLOCK_ZERO; 4011 } 4012 } 4013 } 4014 4015 if (bs->file && 4016 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 4017 (ret & BDRV_BLOCK_OFFSET_VALID)) { 4018 int file_pnum; 4019 4020 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 4021 *pnum, 
&file_pnum); 4022 if (ret2 >= 0) { 4023 /* Ignore errors. This is just providing extra information; it 4024 * is useful but not necessary. 4025 */ 4026 if (!file_pnum) { 4027 /* !file_pnum indicates an offset at or beyond the EOF; it is 4028 * perfectly valid for the format block driver to point to such 4029 * offsets, so catch it and mark everything as zero */ 4030 ret |= BDRV_BLOCK_ZERO; 4031 } else { 4032 /* Limit request to the range reported by the protocol driver */ 4033 *pnum = file_pnum; 4034 ret |= (ret2 & BDRV_BLOCK_ZERO); 4035 } 4036 } 4037 } 4038 4039 return ret; 4040 } 4041 4042 /* Coroutine wrapper for bdrv_get_block_status() */ 4043 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) 4044 { 4045 BdrvCoGetBlockStatusData *data = opaque; 4046 BlockDriverState *bs = data->bs; 4047 4048 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, 4049 data->pnum); 4050 data->done = true; 4051 } 4052 4053 /* 4054 * Synchronous wrapper around bdrv_co_get_block_status(). 4055 * 4056 * See bdrv_co_get_block_status() for details. 4057 */ 4058 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, 4059 int nb_sectors, int *pnum) 4060 { 4061 Coroutine *co; 4062 BdrvCoGetBlockStatusData data = { 4063 .bs = bs, 4064 .sector_num = sector_num, 4065 .nb_sectors = nb_sectors, 4066 .pnum = pnum, 4067 .done = false, 4068 }; 4069 4070 if (qemu_in_coroutine()) { 4071 /* Fast-path if already in coroutine context */ 4072 bdrv_get_block_status_co_entry(&data); 4073 } else { 4074 AioContext *aio_context = bdrv_get_aio_context(bs); 4075 4076 co = qemu_coroutine_create(bdrv_get_block_status_co_entry); 4077 qemu_coroutine_enter(co, &data); 4078 while (!data.done) { 4079 aio_poll(aio_context, true); 4080 } 4081 } 4082 return data.ret; 4083 } 4084 4085 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 4086 int nb_sectors, int *pnum) 4087 { 4088 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 4089 if (ret < 0) { 4090 return ret; 4091 } 4092 return !!(ret & BDRV_BLOCK_ALLOCATED); 4093 } 4094 4095 /* 4096 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 4097 * 4098 * Return true if the given sector is allocated in any image between 4099 * BASE and TOP (inclusive). BASE can be NULL to check if the given 4100 * sector is allocated in any image of the chain. Return false otherwise. 4101 * 4102 * 'pnum' is set to the number of sectors (including and immediately following 4103 * the specified sector) that are known to be in the same 4104 * allocated/unallocated state. 4105 * 4106 */ 4107 int bdrv_is_allocated_above(BlockDriverState *top, 4108 BlockDriverState *base, 4109 int64_t sector_num, 4110 int nb_sectors, int *pnum) 4111 { 4112 BlockDriverState *intermediate; 4113 int ret, n = nb_sectors; 4114 4115 intermediate = top; 4116 while (intermediate && intermediate != base) { 4117 int pnum_inter; 4118 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 4119 &pnum_inter); 4120 if (ret < 0) { 4121 return ret; 4122 } else if (ret) { 4123 *pnum = pnum_inter; 4124 return 1; 4125 } 4126 4127 /* 4128 * [sector_num, nb_sectors] is unallocated on top but intermediate 4129 * might have 4130 * 4131 * [sector_num+x, nb_sectors] allocated.
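 *
 * For example, if TOP reports sectors [4, 12) as unallocated but INTER1
 * can only vouch for [4, 8), n is clamped to 4 sectors so that the
 * caller never receives a range that is only partially described.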
4132 */ 4133 if (n > pnum_inter && 4134 (intermediate == top || 4135 sector_num + pnum_inter < intermediate->total_sectors)) { 4136 n = pnum_inter; 4137 } 4138 4139 intermediate = intermediate->backing_hd; 4140 } 4141 4142 *pnum = n; 4143 return 0; 4144 } 4145 4146 const char *bdrv_get_encrypted_filename(BlockDriverState *bs) 4147 { 4148 if (bs->backing_hd && bs->backing_hd->encrypted) 4149 return bs->backing_file; 4150 else if (bs->encrypted) 4151 return bs->filename; 4152 else 4153 return NULL; 4154 } 4155 4156 void bdrv_get_backing_filename(BlockDriverState *bs, 4157 char *filename, int filename_size) 4158 { 4159 pstrcpy(filename, filename_size, bs->backing_file); 4160 } 4161 4162 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 4163 const uint8_t *buf, int nb_sectors) 4164 { 4165 BlockDriver *drv = bs->drv; 4166 if (!drv) 4167 return -ENOMEDIUM; 4168 if (!drv->bdrv_write_compressed) 4169 return -ENOTSUP; 4170 if (bdrv_check_request(bs, sector_num, nb_sectors)) 4171 return -EIO; 4172 4173 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 4174 4175 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 4176 } 4177 4178 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 4179 { 4180 BlockDriver *drv = bs->drv; 4181 if (!drv) 4182 return -ENOMEDIUM; 4183 if (!drv->bdrv_get_info) 4184 return -ENOTSUP; 4185 memset(bdi, 0, sizeof(*bdi)); 4186 return drv->bdrv_get_info(bs, bdi); 4187 } 4188 4189 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) 4190 { 4191 BlockDriver *drv = bs->drv; 4192 if (drv && drv->bdrv_get_specific_info) { 4193 return drv->bdrv_get_specific_info(bs); 4194 } 4195 return NULL; 4196 } 4197 4198 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 4199 int64_t pos, int size) 4200 { 4201 QEMUIOVector qiov; 4202 struct iovec iov = { 4203 .iov_base = (void *) buf, 4204 .iov_len = size, 4205 }; 4206 4207 qemu_iovec_init_external(&qiov, &iov, 1); 4208 return bdrv_writev_vmstate(bs, &qiov, pos); 4209 } 4210 4211 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 4212 { 4213 BlockDriver *drv = bs->drv; 4214 4215 if (!drv) { 4216 return -ENOMEDIUM; 4217 } else if (drv->bdrv_save_vmstate) { 4218 return drv->bdrv_save_vmstate(bs, qiov, pos); 4219 } else if (bs->file) { 4220 return bdrv_writev_vmstate(bs->file, qiov, pos); 4221 } 4222 4223 return -ENOTSUP; 4224 } 4225 4226 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 4227 int64_t pos, int size) 4228 { 4229 BlockDriver *drv = bs->drv; 4230 if (!drv) 4231 return -ENOMEDIUM; 4232 if (drv->bdrv_load_vmstate) 4233 return drv->bdrv_load_vmstate(bs, buf, pos, size); 4234 if (bs->file) 4235 return bdrv_load_vmstate(bs->file, buf, pos, size); 4236 return -ENOTSUP; 4237 } 4238 4239 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) 4240 { 4241 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { 4242 return; 4243 } 4244 4245 bs->drv->bdrv_debug_event(bs, event); 4246 } 4247 4248 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, 4249 const char *tag) 4250 { 4251 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { 4252 bs = bs->file; 4253 } 4254 4255 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { 4256 return bs->drv->bdrv_debug_breakpoint(bs, event, tag); 4257 } 4258 4259 return -ENOTSUP; 4260 } 4261 4262 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) 4263 { 4264 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { 4265 bs = bs->file; 4266 } 4267 4268 if (bs && 
bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { 4269 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); 4270 } 4271 4272 return -ENOTSUP; 4273 } 4274 4275 int bdrv_debug_resume(BlockDriverState *bs, const char *tag) 4276 { 4277 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { 4278 bs = bs->file; 4279 } 4280 4281 if (bs && bs->drv && bs->drv->bdrv_debug_resume) { 4282 return bs->drv->bdrv_debug_resume(bs, tag); 4283 } 4284 4285 return -ENOTSUP; 4286 } 4287 4288 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) 4289 { 4290 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { 4291 bs = bs->file; 4292 } 4293 4294 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { 4295 return bs->drv->bdrv_debug_is_suspended(bs, tag); 4296 } 4297 4298 return false; 4299 } 4300 4301 int bdrv_is_snapshot(BlockDriverState *bs) 4302 { 4303 return !!(bs->open_flags & BDRV_O_SNAPSHOT); 4304 } 4305 4306 /* backing_file can either be relative, or absolute, or a protocol. If it is 4307 * relative, it must be relative to the chain. So, passing in bs->filename 4308 * from a BDS as backing_file should not be done, as that may be relative to 4309 * the CWD rather than the chain. */ 4310 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, 4311 const char *backing_file) 4312 { 4313 char *filename_full = NULL; 4314 char *backing_file_full = NULL; 4315 char *filename_tmp = NULL; 4316 int is_protocol = 0; 4317 BlockDriverState *curr_bs = NULL; 4318 BlockDriverState *retval = NULL; 4319 4320 if (!bs || !bs->drv || !backing_file) { 4321 return NULL; 4322 } 4323 4324 filename_full = g_malloc(PATH_MAX); 4325 backing_file_full = g_malloc(PATH_MAX); 4326 filename_tmp = g_malloc(PATH_MAX); 4327 4328 is_protocol = path_has_protocol(backing_file); 4329 4330 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { 4331 4332 /* If either of the filename paths is actually a protocol, then 4333 * compare unmodified paths; otherwise make paths relative */ 4334 if (is_protocol || path_has_protocol(curr_bs->backing_file)) { 4335 if (strcmp(backing_file, curr_bs->backing_file) == 0) { 4336 retval = curr_bs->backing_hd; 4337 break; 4338 } 4339 } else { 4340 /* If not an absolute filename path, make it relative to the current 4341 * image's filename path */ 4342 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 4343 backing_file); 4344 4345 /* We are going to compare absolute pathnames */ 4346 if (!realpath(filename_tmp, filename_full)) { 4347 continue; 4348 } 4349 4350 /* We need to make sure the backing filename we are comparing against 4351 * is relative to the current image filename (or absolute) */ 4352 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 4353 curr_bs->backing_file); 4354 4355 if (!realpath(filename_tmp, backing_file_full)) { 4356 continue; 4357 } 4358 4359 if (strcmp(backing_file_full, filename_full) == 0) { 4360 retval = curr_bs->backing_hd; 4361 break; 4362 } 4363 } 4364 } 4365 4366 g_free(filename_full); 4367 g_free(backing_file_full); 4368 g_free(filename_tmp); 4369 return retval; 4370 } 4371 4372 int bdrv_get_backing_file_depth(BlockDriverState *bs) 4373 { 4374 if (!bs->drv) { 4375 return 0; 4376 } 4377 4378 if (!bs->backing_hd) { 4379 return 0; 4380 } 4381 4382 return 1 + bdrv_get_backing_file_depth(bs->backing_hd); 4383 } 4384 4385 /**************************************************************/ 4386 /* async I/Os */ 4387 4388 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 4389 QEMUIOVector *qiov, int 
nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine
    // them. Only exactly sequential or overlapping requests are merged;
    // the merge never has to fill a gap with zero sectors (see the
    // assertion below).
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
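        // Illustrative example (added for clarity, not in the original
        // source): if the current merged request covers sectors [0, 8),
        // then oldreq_last == 8, so a request starting at sector 8
        // (exactly sequential) or at sector 4 (overlapping) is merged,
        // while one starting at sector 9 would leave a gap and is kept
        // separate.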
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the requests
 * may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests.
*/ 4596 mcb->num_requests = num_reqs; 4597 for (i = 0; i < num_reqs; i++) { 4598 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 4599 reqs[i].nb_sectors, reqs[i].flags, 4600 multiwrite_cb, mcb, 4601 true); 4602 } 4603 4604 return 0; 4605 } 4606 4607 void bdrv_aio_cancel(BlockAIOCB *acb) 4608 { 4609 qemu_aio_ref(acb); 4610 bdrv_aio_cancel_async(acb); 4611 while (acb->refcnt > 1) { 4612 if (acb->aiocb_info->get_aio_context) { 4613 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 4614 } else if (acb->bs) { 4615 aio_poll(bdrv_get_aio_context(acb->bs), true); 4616 } else { 4617 abort(); 4618 } 4619 } 4620 qemu_aio_unref(acb); 4621 } 4622 4623 /* Async version of aio cancel. The caller is not blocked if the acb implements 4624 * cancel_async, otherwise we do nothing and let the request normally complete. 4625 * In either case the completion callback must be called. */ 4626 void bdrv_aio_cancel_async(BlockAIOCB *acb) 4627 { 4628 if (acb->aiocb_info->cancel_async) { 4629 acb->aiocb_info->cancel_async(acb); 4630 } 4631 } 4632 4633 /**************************************************************/ 4634 /* async block device emulation */ 4635 4636 typedef struct BlockAIOCBSync { 4637 BlockAIOCB common; 4638 QEMUBH *bh; 4639 int ret; 4640 /* vector translation state */ 4641 QEMUIOVector *qiov; 4642 uint8_t *bounce; 4643 int is_write; 4644 } BlockAIOCBSync; 4645 4646 static const AIOCBInfo bdrv_em_aiocb_info = { 4647 .aiocb_size = sizeof(BlockAIOCBSync), 4648 }; 4649 4650 static void bdrv_aio_bh_cb(void *opaque) 4651 { 4652 BlockAIOCBSync *acb = opaque; 4653 4654 if (!acb->is_write && acb->ret >= 0) { 4655 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 4656 } 4657 qemu_vfree(acb->bounce); 4658 acb->common.cb(acb->common.opaque, acb->ret); 4659 qemu_bh_delete(acb->bh); 4660 acb->bh = NULL; 4661 qemu_aio_unref(acb); 4662 } 4663 4664 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 4665 int64_t sector_num, 4666 QEMUIOVector *qiov, 4667 int nb_sectors, 4668 BlockCompletionFunc *cb, 4669 void *opaque, 4670 int is_write) 4671 4672 { 4673 BlockAIOCBSync *acb; 4674 4675 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 4676 acb->is_write = is_write; 4677 acb->qiov = qiov; 4678 acb->bounce = qemu_try_blockalign(bs, qiov->size); 4679 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); 4680 4681 if (acb->bounce == NULL) { 4682 acb->ret = -ENOMEM; 4683 } else if (is_write) { 4684 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 4685 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 4686 } else { 4687 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 4688 } 4689 4690 qemu_bh_schedule(acb->bh); 4691 4692 return &acb->common; 4693 } 4694 4695 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 4696 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 4697 BlockCompletionFunc *cb, void *opaque) 4698 { 4699 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 4700 } 4701 4702 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 4703 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 4704 BlockCompletionFunc *cb, void *opaque) 4705 { 4706 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 4707 } 4708 4709 4710 typedef struct BlockAIOCBCoroutine { 4711 BlockAIOCB common; 4712 BlockRequest req; 4713 bool is_write; 4714 bool *done; 4715 QEMUBH* bh; 4716 } BlockAIOCBCoroutine; 4717 4718 static const AIOCBInfo bdrv_em_co_aiocb_info = { 
4719 .aiocb_size = sizeof(BlockAIOCBCoroutine), 4720 }; 4721 4722 static void bdrv_co_em_bh(void *opaque) 4723 { 4724 BlockAIOCBCoroutine *acb = opaque; 4725 4726 acb->common.cb(acb->common.opaque, acb->req.error); 4727 4728 qemu_bh_delete(acb->bh); 4729 qemu_aio_unref(acb); 4730 } 4731 4732 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 4733 static void coroutine_fn bdrv_co_do_rw(void *opaque) 4734 { 4735 BlockAIOCBCoroutine *acb = opaque; 4736 BlockDriverState *bs = acb->common.bs; 4737 4738 if (!acb->is_write) { 4739 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 4740 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 4741 } else { 4742 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 4743 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 4744 } 4745 4746 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4747 qemu_bh_schedule(acb->bh); 4748 } 4749 4750 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 4751 int64_t sector_num, 4752 QEMUIOVector *qiov, 4753 int nb_sectors, 4754 BdrvRequestFlags flags, 4755 BlockCompletionFunc *cb, 4756 void *opaque, 4757 bool is_write) 4758 { 4759 Coroutine *co; 4760 BlockAIOCBCoroutine *acb; 4761 4762 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4763 acb->req.sector = sector_num; 4764 acb->req.nb_sectors = nb_sectors; 4765 acb->req.qiov = qiov; 4766 acb->req.flags = flags; 4767 acb->is_write = is_write; 4768 4769 co = qemu_coroutine_create(bdrv_co_do_rw); 4770 qemu_coroutine_enter(co, acb); 4771 4772 return &acb->common; 4773 } 4774 4775 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 4776 { 4777 BlockAIOCBCoroutine *acb = opaque; 4778 BlockDriverState *bs = acb->common.bs; 4779 4780 acb->req.error = bdrv_co_flush(bs); 4781 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4782 qemu_bh_schedule(acb->bh); 4783 } 4784 4785 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, 4786 BlockCompletionFunc *cb, void *opaque) 4787 { 4788 trace_bdrv_aio_flush(bs, opaque); 4789 4790 Coroutine *co; 4791 BlockAIOCBCoroutine *acb; 4792 4793 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4794 4795 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 4796 qemu_coroutine_enter(co, acb); 4797 4798 return &acb->common; 4799 } 4800 4801 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 4802 { 4803 BlockAIOCBCoroutine *acb = opaque; 4804 BlockDriverState *bs = acb->common.bs; 4805 4806 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 4807 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4808 qemu_bh_schedule(acb->bh); 4809 } 4810 4811 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, 4812 int64_t sector_num, int nb_sectors, 4813 BlockCompletionFunc *cb, void *opaque) 4814 { 4815 Coroutine *co; 4816 BlockAIOCBCoroutine *acb; 4817 4818 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 4819 4820 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4821 acb->req.sector = sector_num; 4822 acb->req.nb_sectors = nb_sectors; 4823 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 4824 qemu_coroutine_enter(co, acb); 4825 4826 return &acb->common; 4827 } 4828 4829 void bdrv_init(void) 4830 { 4831 module_call_init(MODULE_INIT_BLOCK); 4832 } 4833 4834 void bdrv_init_with_whitelist(void) 4835 { 4836 use_bdrv_whitelist = 1; 4837 bdrv_init(); 4838 } 4839 4840 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 4841 BlockCompletionFunc *cb, void *opaque) 4842 { 4843 BlockAIOCB *acb; 
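    /* Illustrative use (hypothetical driver code, not from the original
     * source), assuming a MyAIOCB struct whose first member is a BlockAIOCB:
     *
     *     static const AIOCBInfo my_aiocb_info = {
     *         .aiocb_size = sizeof(MyAIOCB),
     *     };
     *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
     *
     * The AIOCB is allocated from a slice pool sized by aiocb_size, starts
     * with refcnt == 1 and is freed by qemu_aio_unref() when the last
     * reference is dropped.
     */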
4844 4845 acb = g_slice_alloc(aiocb_info->aiocb_size); 4846 acb->aiocb_info = aiocb_info; 4847 acb->bs = bs; 4848 acb->cb = cb; 4849 acb->opaque = opaque; 4850 acb->refcnt = 1; 4851 return acb; 4852 } 4853 4854 void qemu_aio_ref(void *p) 4855 { 4856 BlockAIOCB *acb = p; 4857 acb->refcnt++; 4858 } 4859 4860 void qemu_aio_unref(void *p) 4861 { 4862 BlockAIOCB *acb = p; 4863 assert(acb->refcnt > 0); 4864 if (--acb->refcnt == 0) { 4865 g_slice_free1(acb->aiocb_info->aiocb_size, acb); 4866 } 4867 } 4868 4869 /**************************************************************/ 4870 /* Coroutine block device emulation */ 4871 4872 typedef struct CoroutineIOCompletion { 4873 Coroutine *coroutine; 4874 int ret; 4875 } CoroutineIOCompletion; 4876 4877 static void bdrv_co_io_em_complete(void *opaque, int ret) 4878 { 4879 CoroutineIOCompletion *co = opaque; 4880 4881 co->ret = ret; 4882 qemu_coroutine_enter(co->coroutine, NULL); 4883 } 4884 4885 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 4886 int nb_sectors, QEMUIOVector *iov, 4887 bool is_write) 4888 { 4889 CoroutineIOCompletion co = { 4890 .coroutine = qemu_coroutine_self(), 4891 }; 4892 BlockAIOCB *acb; 4893 4894 if (is_write) { 4895 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 4896 bdrv_co_io_em_complete, &co); 4897 } else { 4898 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 4899 bdrv_co_io_em_complete, &co); 4900 } 4901 4902 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 4903 if (!acb) { 4904 return -EIO; 4905 } 4906 qemu_coroutine_yield(); 4907 4908 return co.ret; 4909 } 4910 4911 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 4912 int64_t sector_num, int nb_sectors, 4913 QEMUIOVector *iov) 4914 { 4915 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 4916 } 4917 4918 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 4919 int64_t sector_num, int nb_sectors, 4920 QEMUIOVector *iov) 4921 { 4922 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 4923 } 4924 4925 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 4926 { 4927 RwCo *rwco = opaque; 4928 4929 rwco->ret = bdrv_co_flush(rwco->bs); 4930 } 4931 4932 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 4933 { 4934 int ret; 4935 4936 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { 4937 return 0; 4938 } 4939 4940 /* Write back cached data to the OS even with cache=unsafe */ 4941 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 4942 if (bs->drv->bdrv_co_flush_to_os) { 4943 ret = bs->drv->bdrv_co_flush_to_os(bs); 4944 if (ret < 0) { 4945 return ret; 4946 } 4947 } 4948 4949 /* But don't actually force it to the disk with cache=unsafe */ 4950 if (bs->open_flags & BDRV_O_NO_FLUSH) { 4951 goto flush_parent; 4952 } 4953 4954 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 4955 if (bs->drv->bdrv_co_flush_to_disk) { 4956 ret = bs->drv->bdrv_co_flush_to_disk(bs); 4957 } else if (bs->drv->bdrv_aio_flush) { 4958 BlockAIOCB *acb; 4959 CoroutineIOCompletion co = { 4960 .coroutine = qemu_coroutine_self(), 4961 }; 4962 4963 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 4964 if (acb == NULL) { 4965 ret = -EIO; 4966 } else { 4967 qemu_coroutine_yield(); 4968 ret = co.ret; 4969 } 4970 } else { 4971 /* 4972 * Some block drivers always operate in either writethrough or unsafe 4973 * mode and don't support bdrv_flush therefore. 
Usually qemu doesn't 4974 * know how the server works (because the behaviour is hardcoded or 4975 * depends on server-side configuration), so we can't ensure that 4976 * everything is safe on disk. Returning an error doesn't work because 4977 * that would break guests even if the server operates in writethrough 4978 * mode. 4979 * 4980 * Let's hope the user knows what he's doing. 4981 */ 4982 ret = 0; 4983 } 4984 if (ret < 0) { 4985 return ret; 4986 } 4987 4988 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 4989 * in the case of cache=unsafe, so there are no useless flushes. 4990 */ 4991 flush_parent: 4992 return bdrv_co_flush(bs->file); 4993 } 4994 4995 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) 4996 { 4997 Error *local_err = NULL; 4998 int ret; 4999 5000 if (!bs->drv) { 5001 return; 5002 } 5003 5004 if (!(bs->open_flags & BDRV_O_INCOMING)) { 5005 return; 5006 } 5007 bs->open_flags &= ~BDRV_O_INCOMING; 5008 5009 if (bs->drv->bdrv_invalidate_cache) { 5010 bs->drv->bdrv_invalidate_cache(bs, &local_err); 5011 } else if (bs->file) { 5012 bdrv_invalidate_cache(bs->file, &local_err); 5013 } 5014 if (local_err) { 5015 error_propagate(errp, local_err); 5016 return; 5017 } 5018 5019 ret = refresh_total_sectors(bs, bs->total_sectors); 5020 if (ret < 0) { 5021 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 5022 return; 5023 } 5024 } 5025 5026 void bdrv_invalidate_cache_all(Error **errp) 5027 { 5028 BlockDriverState *bs; 5029 Error *local_err = NULL; 5030 5031 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 5032 AioContext *aio_context = bdrv_get_aio_context(bs); 5033 5034 aio_context_acquire(aio_context); 5035 bdrv_invalidate_cache(bs, &local_err); 5036 aio_context_release(aio_context); 5037 if (local_err) { 5038 error_propagate(errp, local_err); 5039 return; 5040 } 5041 } 5042 } 5043 5044 int bdrv_flush(BlockDriverState *bs) 5045 { 5046 Coroutine *co; 5047 RwCo rwco = { 5048 .bs = bs, 5049 .ret = NOT_DONE, 5050 }; 5051 5052 if (qemu_in_coroutine()) { 5053 /* Fast-path if already in coroutine context */ 5054 bdrv_flush_co_entry(&rwco); 5055 } else { 5056 AioContext *aio_context = bdrv_get_aio_context(bs); 5057 5058 co = qemu_coroutine_create(bdrv_flush_co_entry); 5059 qemu_coroutine_enter(co, &rwco); 5060 while (rwco.ret == NOT_DONE) { 5061 aio_poll(aio_context, true); 5062 } 5063 } 5064 5065 return rwco.ret; 5066 } 5067 5068 typedef struct DiscardCo { 5069 BlockDriverState *bs; 5070 int64_t sector_num; 5071 int nb_sectors; 5072 int ret; 5073 } DiscardCo; 5074 static void coroutine_fn bdrv_discard_co_entry(void *opaque) 5075 { 5076 DiscardCo *rwco = opaque; 5077 5078 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); 5079 } 5080 5081 /* if no limit is specified in the BlockLimits use a default 5082 * of 32768 512-byte sectors (16 MiB) per request. 5083 */ 5084 #define MAX_DISCARD_DEFAULT 32768 5085 5086 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, 5087 int nb_sectors) 5088 { 5089 int max_discard; 5090 5091 if (!bs->drv) { 5092 return -ENOMEDIUM; 5093 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { 5094 return -EIO; 5095 } else if (bs->read_only) { 5096 return -EROFS; 5097 } 5098 5099 bdrv_reset_dirty(bs, sector_num, nb_sectors); 5100 5101 /* Do nothing if disabled. 
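     * That is, the image was opened without BDRV_O_UNMAP, so guest discard
     * requests are deliberately ignored and reported as successful.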
     */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass the clamped count num, not the full remaining
             * nb_sectors; otherwise each iteration would discard the whole
             * remaining range and ignore the alignment and size limits
             * computed above. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;
    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    device_name = bdrv_get_device_name(bs);
    if (device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(device_name,
                                          eject_flag, &error_abort);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
5236 */ 5237 void bdrv_lock_medium(BlockDriverState *bs, bool locked) 5238 { 5239 BlockDriver *drv = bs->drv; 5240 5241 trace_bdrv_lock_medium(bs, locked); 5242 5243 if (drv && drv->bdrv_lock_medium) { 5244 drv->bdrv_lock_medium(bs, locked); 5245 } 5246 } 5247 5248 /* needed for generic scsi interface */ 5249 5250 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 5251 { 5252 BlockDriver *drv = bs->drv; 5253 5254 if (drv && drv->bdrv_ioctl) 5255 return drv->bdrv_ioctl(bs, req, buf); 5256 return -ENOTSUP; 5257 } 5258 5259 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, 5260 unsigned long int req, void *buf, 5261 BlockCompletionFunc *cb, void *opaque) 5262 { 5263 BlockDriver *drv = bs->drv; 5264 5265 if (drv && drv->bdrv_aio_ioctl) 5266 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); 5267 return NULL; 5268 } 5269 5270 void bdrv_set_guest_block_size(BlockDriverState *bs, int align) 5271 { 5272 bs->guest_block_size = align; 5273 } 5274 5275 void *qemu_blockalign(BlockDriverState *bs, size_t size) 5276 { 5277 return qemu_memalign(bdrv_opt_mem_align(bs), size); 5278 } 5279 5280 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 5281 { 5282 return memset(qemu_blockalign(bs, size), 0, size); 5283 } 5284 5285 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 5286 { 5287 size_t align = bdrv_opt_mem_align(bs); 5288 5289 /* Ensure that NULL is never returned on success */ 5290 assert(align > 0); 5291 if (size == 0) { 5292 size = align; 5293 } 5294 5295 return qemu_try_memalign(align, size); 5296 } 5297 5298 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 5299 { 5300 void *mem = qemu_try_blockalign(bs, size); 5301 5302 if (mem) { 5303 memset(mem, 0, size); 5304 } 5305 5306 return mem; 5307 } 5308 5309 /* 5310 * Check if all memory in this vector is sector aligned. 
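 * "Aligned" here means aligned to the memory alignment required by the
 * driver, as reported by bdrv_opt_mem_align(); this may be stricter than
 * 512-byte sector alignment.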
5311 */ 5312 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 5313 { 5314 int i; 5315 size_t alignment = bdrv_opt_mem_align(bs); 5316 5317 for (i = 0; i < qiov->niov; i++) { 5318 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 5319 return false; 5320 } 5321 if (qiov->iov[i].iov_len % alignment) { 5322 return false; 5323 } 5324 } 5325 5326 return true; 5327 } 5328 5329 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, 5330 Error **errp) 5331 { 5332 int64_t bitmap_size; 5333 BdrvDirtyBitmap *bitmap; 5334 5335 assert((granularity & (granularity - 1)) == 0); 5336 5337 granularity >>= BDRV_SECTOR_BITS; 5338 assert(granularity); 5339 bitmap_size = bdrv_nb_sectors(bs); 5340 if (bitmap_size < 0) { 5341 error_setg_errno(errp, -bitmap_size, "could not get length of device"); 5342 errno = -bitmap_size; 5343 return NULL; 5344 } 5345 bitmap = g_new0(BdrvDirtyBitmap, 1); 5346 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); 5347 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); 5348 return bitmap; 5349 } 5350 5351 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) 5352 { 5353 BdrvDirtyBitmap *bm, *next; 5354 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { 5355 if (bm == bitmap) { 5356 QLIST_REMOVE(bitmap, list); 5357 hbitmap_free(bitmap->bitmap); 5358 g_free(bitmap); 5359 return; 5360 } 5361 } 5362 } 5363 5364 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) 5365 { 5366 BdrvDirtyBitmap *bm; 5367 BlockDirtyInfoList *list = NULL; 5368 BlockDirtyInfoList **plist = &list; 5369 5370 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { 5371 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); 5372 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); 5373 info->count = bdrv_get_dirty_count(bs, bm); 5374 info->granularity = 5375 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); 5376 entry->value = info; 5377 *plist = entry; 5378 plist = &entry->next; 5379 } 5380 5381 return list; 5382 } 5383 5384 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector) 5385 { 5386 if (bitmap) { 5387 return hbitmap_get(bitmap->bitmap, sector); 5388 } else { 5389 return 0; 5390 } 5391 } 5392 5393 void bdrv_dirty_iter_init(BlockDriverState *bs, 5394 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) 5395 { 5396 hbitmap_iter_init(hbi, bitmap->bitmap, 0); 5397 } 5398 5399 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, 5400 int nr_sectors) 5401 { 5402 BdrvDirtyBitmap *bitmap; 5403 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { 5404 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); 5405 } 5406 } 5407 5408 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) 5409 { 5410 BdrvDirtyBitmap *bitmap; 5411 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { 5412 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); 5413 } 5414 } 5415 5416 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) 5417 { 5418 return hbitmap_count(bitmap->bitmap); 5419 } 5420 5421 /* Get a reference to bs */ 5422 void bdrv_ref(BlockDriverState *bs) 5423 { 5424 bs->refcnt++; 5425 } 5426 5427 /* Release a previously grabbed reference to bs. 5428 * If after releasing, reference count is zero, the BlockDriverState is 5429 * deleted. 
*/ 5430 void bdrv_unref(BlockDriverState *bs) 5431 { 5432 if (!bs) { 5433 return; 5434 } 5435 assert(bs->refcnt > 0); 5436 if (--bs->refcnt == 0) { 5437 bdrv_delete(bs); 5438 } 5439 } 5440 5441 struct BdrvOpBlocker { 5442 Error *reason; 5443 QLIST_ENTRY(BdrvOpBlocker) list; 5444 }; 5445 5446 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp) 5447 { 5448 BdrvOpBlocker *blocker; 5449 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5450 if (!QLIST_EMPTY(&bs->op_blockers[op])) { 5451 blocker = QLIST_FIRST(&bs->op_blockers[op]); 5452 if (errp) { 5453 error_setg(errp, "Device '%s' is busy: %s", 5454 bdrv_get_device_name(bs), 5455 error_get_pretty(blocker->reason)); 5456 } 5457 return true; 5458 } 5459 return false; 5460 } 5461 5462 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason) 5463 { 5464 BdrvOpBlocker *blocker; 5465 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5466 5467 blocker = g_new0(BdrvOpBlocker, 1); 5468 blocker->reason = reason; 5469 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list); 5470 } 5471 5472 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason) 5473 { 5474 BdrvOpBlocker *blocker, *next; 5475 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5476 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) { 5477 if (blocker->reason == reason) { 5478 QLIST_REMOVE(blocker, list); 5479 g_free(blocker); 5480 } 5481 } 5482 } 5483 5484 void bdrv_op_block_all(BlockDriverState *bs, Error *reason) 5485 { 5486 int i; 5487 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5488 bdrv_op_block(bs, i, reason); 5489 } 5490 } 5491 5492 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason) 5493 { 5494 int i; 5495 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5496 bdrv_op_unblock(bs, i, reason); 5497 } 5498 } 5499 5500 bool bdrv_op_blocker_is_empty(BlockDriverState *bs) 5501 { 5502 int i; 5503 5504 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5505 if (!QLIST_EMPTY(&bs->op_blockers[i])) { 5506 return false; 5507 } 5508 } 5509 return true; 5510 } 5511 5512 void bdrv_iostatus_enable(BlockDriverState *bs) 5513 { 5514 bs->iostatus_enabled = true; 5515 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; 5516 } 5517 5518 /* The I/O status is only enabled if the drive explicitly 5519 * enables it _and_ the VM is configured to stop on errors */ 5520 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs) 5521 { 5522 return (bs->iostatus_enabled && 5523 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || 5524 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP || 5525 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP)); 5526 } 5527 5528 void bdrv_iostatus_disable(BlockDriverState *bs) 5529 { 5530 bs->iostatus_enabled = false; 5531 } 5532 5533 void bdrv_iostatus_reset(BlockDriverState *bs) 5534 { 5535 if (bdrv_iostatus_is_enabled(bs)) { 5536 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; 5537 if (bs->job) { 5538 block_job_iostatus_reset(bs->job); 5539 } 5540 } 5541 } 5542 5543 void bdrv_iostatus_set_err(BlockDriverState *bs, int error) 5544 { 5545 assert(bdrv_iostatus_is_enabled(bs)); 5546 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { 5547 bs->iostatus = error == ENOSPC ? 
BLOCK_DEVICE_IO_STATUS_NOSPACE : 5548 BLOCK_DEVICE_IO_STATUS_FAILED; 5549 } 5550 } 5551 5552 void bdrv_img_create(const char *filename, const char *fmt, 5553 const char *base_filename, const char *base_fmt, 5554 char *options, uint64_t img_size, int flags, 5555 Error **errp, bool quiet) 5556 { 5557 QemuOptsList *create_opts = NULL; 5558 QemuOpts *opts = NULL; 5559 const char *backing_fmt, *backing_file; 5560 int64_t size; 5561 BlockDriver *drv, *proto_drv; 5562 BlockDriver *backing_drv = NULL; 5563 Error *local_err = NULL; 5564 int ret = 0; 5565 5566 /* Find driver and parse its options */ 5567 drv = bdrv_find_format(fmt); 5568 if (!drv) { 5569 error_setg(errp, "Unknown file format '%s'", fmt); 5570 return; 5571 } 5572 5573 proto_drv = bdrv_find_protocol(filename, true); 5574 if (!proto_drv) { 5575 error_setg(errp, "Unknown protocol '%s'", filename); 5576 return; 5577 } 5578 5579 create_opts = qemu_opts_append(create_opts, drv->create_opts); 5580 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); 5581 5582 /* Create parameter list with default values */ 5583 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort); 5584 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size); 5585 5586 /* Parse -o options */ 5587 if (options) { 5588 if (qemu_opts_do_parse(opts, options, NULL) != 0) { 5589 error_setg(errp, "Invalid options for file format '%s'", fmt); 5590 goto out; 5591 } 5592 } 5593 5594 if (base_filename) { 5595 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) { 5596 error_setg(errp, "Backing file not supported for file format '%s'", 5597 fmt); 5598 goto out; 5599 } 5600 } 5601 5602 if (base_fmt) { 5603 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) { 5604 error_setg(errp, "Backing file format not supported for file " 5605 "format '%s'", fmt); 5606 goto out; 5607 } 5608 } 5609 5610 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE); 5611 if (backing_file) { 5612 if (!strcmp(filename, backing_file)) { 5613 error_setg(errp, "Error: Trying to create an image with the " 5614 "same filename as the backing file"); 5615 goto out; 5616 } 5617 } 5618 5619 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); 5620 if (backing_fmt) { 5621 backing_drv = bdrv_find_format(backing_fmt); 5622 if (!backing_drv) { 5623 error_setg(errp, "Unknown backing file format '%s'", 5624 backing_fmt); 5625 goto out; 5626 } 5627 } 5628 5629 // The size for the image must always be specified, with one exception: 5630 // If we are using a backing file, we can obtain the size from there 5631 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); 5632 if (size == -1) { 5633 if (backing_file) { 5634 BlockDriverState *bs; 5635 int64_t size; 5636 int back_flags; 5637 5638 /* backing files always opened read-only */ 5639 back_flags = 5640 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); 5641 5642 bs = NULL; 5643 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags, 5644 backing_drv, &local_err); 5645 if (ret < 0) { 5646 goto out; 5647 } 5648 size = bdrv_getlength(bs); 5649 if (size < 0) { 5650 error_setg_errno(errp, -size, "Could not get size of '%s'", 5651 backing_file); 5652 bdrv_unref(bs); 5653 goto out; 5654 } 5655 5656 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size); 5657 5658 bdrv_unref(bs); 5659 } else { 5660 error_setg(errp, "Image creation needs a size parameter"); 5661 goto out; 5662 } 5663 } 5664 5665 if (!quiet) { 5666 printf("Formatting '%s', fmt=%s ", filename, fmt); 5667 qemu_opts_print(opts); 5668 puts(""); 5669 } 5670 5671 ret = bdrv_create(drv, 
filename, opts, &local_err); 5672 5673 if (ret == -EFBIG) { 5674 /* This is generally a better message than whatever the driver would 5675 * deliver (especially because of the cluster_size_hint), since that 5676 * is most probably not much different from "image too large". */ 5677 const char *cluster_size_hint = ""; 5678 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) { 5679 cluster_size_hint = " (try using a larger cluster size)"; 5680 } 5681 error_setg(errp, "The image size is too large for file format '%s'" 5682 "%s", fmt, cluster_size_hint); 5683 error_free(local_err); 5684 local_err = NULL; 5685 } 5686 5687 out: 5688 qemu_opts_del(opts); 5689 qemu_opts_free(create_opts); 5690 if (local_err) { 5691 error_propagate(errp, local_err); 5692 } 5693 } 5694 5695 AioContext *bdrv_get_aio_context(BlockDriverState *bs) 5696 { 5697 return bs->aio_context; 5698 } 5699 5700 void bdrv_detach_aio_context(BlockDriverState *bs) 5701 { 5702 BdrvAioNotifier *baf; 5703 5704 if (!bs->drv) { 5705 return; 5706 } 5707 5708 QLIST_FOREACH(baf, &bs->aio_notifiers, list) { 5709 baf->detach_aio_context(baf->opaque); 5710 } 5711 5712 if (bs->io_limits_enabled) { 5713 throttle_detach_aio_context(&bs->throttle_state); 5714 } 5715 if (bs->drv->bdrv_detach_aio_context) { 5716 bs->drv->bdrv_detach_aio_context(bs); 5717 } 5718 if (bs->file) { 5719 bdrv_detach_aio_context(bs->file); 5720 } 5721 if (bs->backing_hd) { 5722 bdrv_detach_aio_context(bs->backing_hd); 5723 } 5724 5725 bs->aio_context = NULL; 5726 } 5727 5728 void bdrv_attach_aio_context(BlockDriverState *bs, 5729 AioContext *new_context) 5730 { 5731 BdrvAioNotifier *ban; 5732 5733 if (!bs->drv) { 5734 return; 5735 } 5736 5737 bs->aio_context = new_context; 5738 5739 if (bs->backing_hd) { 5740 bdrv_attach_aio_context(bs->backing_hd, new_context); 5741 } 5742 if (bs->file) { 5743 bdrv_attach_aio_context(bs->file, new_context); 5744 } 5745 if (bs->drv->bdrv_attach_aio_context) { 5746 bs->drv->bdrv_attach_aio_context(bs, new_context); 5747 } 5748 if (bs->io_limits_enabled) { 5749 throttle_attach_aio_context(&bs->throttle_state, new_context); 5750 } 5751 5752 QLIST_FOREACH(ban, &bs->aio_notifiers, list) { 5753 ban->attached_aio_context(new_context, ban->opaque); 5754 } 5755 } 5756 5757 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context) 5758 { 5759 bdrv_drain_all(); /* ensure there are no in-flight requests */ 5760 5761 bdrv_detach_aio_context(bs); 5762 5763 /* This function executes in the old AioContext so acquire the new one in 5764 * case it runs in a different thread. 
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}

void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
{
    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
    *ban = (BdrvAioNotifier){
        .attached_aio_context = attached_aio_context,
        .detach_aio_context = detach_aio_context,
        .opaque = opaque
    };

    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
}

void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*attached_aio_context)(AioContext *,
                                                                   void *),
                                      void (*detach_aio_context)(void *),
                                      void *opaque)
{
    BdrvAioNotifier *ban, *ban_next;

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        if (ban->attached_aio_context == attached_aio_context &&
            ban->detach_aio_context == detach_aio_context &&
            ban->opaque == opaque)
        {
            QLIST_REMOVE(ban, list);
            g_free(ban);

            return;
        }
    }

    abort();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
                       BlockDriverAmendStatusCB *status_cb)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts, status_cb);
}

/* This function is called by the bdrv_recurse_is_first_non_filter method of
 * block filters, and by bdrv_is_first_non_filter.
 * It is used to test whether the given bs is the candidate, or to recurse
 * further down the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* The code has reached a non-filter driver -> check whether bs is the
     * same as the candidate. This is the recursion's termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined, use it to recurse
     * down the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* The driver is a block filter but does not allow recursion -> return
     * false
     */
    return false;
}

/* This function checks whether the candidate is the first non-filter bs down
 * its bs chain. Since we don't have pointers to parents, it explores all bs
 * chains from the top. Some filters can choose not to pass down the
 * recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}

BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    AioContext *aio_context;

    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    aio_context = bdrv_get_aio_context(to_replace_bs);
    aio_context_acquire(aio_context);

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        to_replace_bs = NULL;
        goto out;
    }

    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the topmost non-filter, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files, which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        to_replace_bs = NULL;
        goto out;
    }

out:
    aio_context_release(aio_context);
    return to_replace_bs;
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}

static bool append_open_options(QDict *d, BlockDriverState *bs)
{
    const QDictEntry *entry;
    bool found_any = false;

    for (entry = qdict_first(bs->options); entry;
         entry = qdict_next(bs->options, entry))
    {
        /* Only take options for this level and exclude all non-driver-specific
         * options */
        if (!strchr(qdict_entry_key(entry), '.') &&
            strcmp(qdict_entry_key(entry), "node-name"))
        {
            qobject_incref(qdict_entry_value(entry));
            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
            found_any = true;
        }
    }

    return found_any;
}

/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the
 *                    same results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here.
Otherwise, 5975 * full_open_options is converted to a JSON object, prefixed with 5976 * "json:" (for use through the JSON pseudo protocol) and put here. 5977 */ 5978 void bdrv_refresh_filename(BlockDriverState *bs) 5979 { 5980 BlockDriver *drv = bs->drv; 5981 QDict *opts; 5982 5983 if (!drv) { 5984 return; 5985 } 5986 5987 /* This BDS's file name will most probably depend on its file's name, so 5988 * refresh that first */ 5989 if (bs->file) { 5990 bdrv_refresh_filename(bs->file); 5991 } 5992 5993 if (drv->bdrv_refresh_filename) { 5994 /* Obsolete information is of no use here, so drop the old file name 5995 * information before refreshing it */ 5996 bs->exact_filename[0] = '\0'; 5997 if (bs->full_open_options) { 5998 QDECREF(bs->full_open_options); 5999 bs->full_open_options = NULL; 6000 } 6001 6002 drv->bdrv_refresh_filename(bs); 6003 } else if (bs->file) { 6004 /* Try to reconstruct valid information from the underlying file */ 6005 bool has_open_options; 6006 6007 bs->exact_filename[0] = '\0'; 6008 if (bs->full_open_options) { 6009 QDECREF(bs->full_open_options); 6010 bs->full_open_options = NULL; 6011 } 6012 6013 opts = qdict_new(); 6014 has_open_options = append_open_options(opts, bs); 6015 6016 /* If no specific options have been given for this BDS, the filename of 6017 * the underlying file should suffice for this one as well */ 6018 if (bs->file->exact_filename[0] && !has_open_options) { 6019 strcpy(bs->exact_filename, bs->file->exact_filename); 6020 } 6021 /* Reconstructing the full options QDict is simple for most format block 6022 * drivers, as long as the full options are known for the underlying 6023 * file BDS. The full options QDict of that file BDS should somehow 6024 * contain a representation of the filename, therefore the following 6025 * suffices without querying the (exact_)filename of this BDS. */ 6026 if (bs->file->full_open_options) { 6027 qdict_put_obj(opts, "driver", 6028 QOBJECT(qstring_from_str(drv->format_name))); 6029 QINCREF(bs->file->full_open_options); 6030 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options)); 6031 6032 bs->full_open_options = opts; 6033 } else { 6034 QDECREF(opts); 6035 } 6036 } else if (!bs->full_open_options && qdict_size(bs->options)) { 6037 /* There is no underlying file BDS (at least referenced by BDS.file), 6038 * so the full options QDict should be equal to the options given 6039 * specifically for this block device when it was opened (plus the 6040 * driver specification). 6041 * Because those options don't change, there is no need to update 6042 * full_open_options when it's already set. */ 6043 6044 opts = qdict_new(); 6045 append_open_options(opts, bs); 6046 qdict_put_obj(opts, "driver", 6047 QOBJECT(qstring_from_str(drv->format_name))); 6048 6049 if (bs->exact_filename[0]) { 6050 /* This may not work for all block protocol drivers (some may 6051 * require this filename to be parsed), but we have to find some 6052 * default solution here, so just include it. If some block driver 6053 * does not support pure options without any filename at all or 6054 * needs some special format of the options QDict, it needs to 6055 * implement the driver-specific bdrv_refresh_filename() function. 
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}

/* The purpose of this accessor function is to allow the device models to
 * access the BlockAcctStats structure embedded inside a BlockDriverState
 * without being aware of the BlockDriverState structure layout.
 * It will go away once the BlockAcctStats structure is moved inside the
 * device models.
 */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
{
    return &bs->stats;
}
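
/* Illustrative caller (hypothetical device-model code, not part of this
 * file), pairing the accessor with the block accounting API from
 * "block/accounting.h":
 *
 *     BlockAcctStats *stats = bdrv_get_stats(bs);
 *     block_acct_start(stats, &req->acct, bytes, BLOCK_ACCT_READ);
 *     ...perform the I/O...
 *     block_acct_done(stats, &req->acct);
 */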