/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O request wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* must this I/O wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this I/O too */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
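
/*
 * Example (hypothetical usage sketch, not part of this file): enabling
 * throttling on a BlockDriverState and applying a total bandwidth limit.
 * The ThrottleConfig initialization below is an assumption about the
 * throttle API used above; only the call order is guaranteed by the
 * assert in bdrv_io_limits_enable().
 *
 *     ThrottleConfig cfg = { .buckets = {
 *         [THROTTLE_BPS_TOTAL] = { .avg = 10 * 1024 * 1024 },  // 10 MB/s
 *     } };
 *     bdrv_io_limits_enable(bs);    // must come first
 *     bdrv_set_io_limits(bs, &cfg);
 */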
"<protocol>:" */ 232 int path_has_protocol(const char *path) 233 { 234 const char *p; 235 236 #ifdef _WIN32 237 if (is_windows_drive(path) || 238 is_windows_drive_prefix(path)) { 239 return 0; 240 } 241 p = path + strcspn(path, ":/\\"); 242 #else 243 p = path + strcspn(path, ":/"); 244 #endif 245 246 return *p == ':'; 247 } 248 249 int path_is_absolute(const char *path) 250 { 251 #ifdef _WIN32 252 /* specific case for names like: "\\.\d:" */ 253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) { 254 return 1; 255 } 256 return (*path == '/' || *path == '\\'); 257 #else 258 return (*path == '/'); 259 #endif 260 } 261 262 /* if filename is absolute, just copy it to dest. Otherwise, build a 263 path to it by considering it is relative to base_path. URL are 264 supported. */ 265 void path_combine(char *dest, int dest_size, 266 const char *base_path, 267 const char *filename) 268 { 269 const char *p, *p1; 270 int len; 271 272 if (dest_size <= 0) 273 return; 274 if (path_is_absolute(filename)) { 275 pstrcpy(dest, dest_size, filename); 276 } else { 277 p = strchr(base_path, ':'); 278 if (p) 279 p++; 280 else 281 p = base_path; 282 p1 = strrchr(base_path, '/'); 283 #ifdef _WIN32 284 { 285 const char *p2; 286 p2 = strrchr(base_path, '\\'); 287 if (!p1 || p2 > p1) 288 p1 = p2; 289 } 290 #endif 291 if (p1) 292 p1++; 293 else 294 p1 = base_path; 295 if (p1 > p) 296 p = p1; 297 len = p - base_path; 298 if (len > dest_size - 1) 299 len = dest_size - 1; 300 memcpy(dest, base_path, len); 301 dest[len] = '\0'; 302 pstrcat(dest, dest_size, filename); 303 } 304 } 305 306 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz) 307 { 308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) { 309 pstrcpy(dest, sz, bs->backing_file); 310 } else { 311 path_combine(dest, sz, bs->filename, bs->backing_file); 312 } 313 } 314 315 void bdrv_register(BlockDriver *bdrv) 316 { 317 /* Block drivers without coroutine functions need emulation */ 318 if (!bdrv->bdrv_co_readv) { 319 bdrv->bdrv_co_readv = bdrv_co_readv_em; 320 bdrv->bdrv_co_writev = bdrv_co_writev_em; 321 322 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if 323 * the block driver lacks aio we need to emulate that too. 

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
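
/*
 * Example (hypothetical usage sketch): creating a 1 GiB qcow2 image via
 * bdrv_create() above. This mirrors how bdrv_append_temp_snapshot() later
 * in this file creates its temporary overlay; error handling is elided.
 *
 *     QemuOpts *opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
 *                                       &error_abort);
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024 * 1024);
 *     ret = bdrv_create(&bdrv_qcow2, "/tmp/test.qcow2", opts, &local_err);
 *     qemu_opts_del(opts);
 */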

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
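
/*
 * Example behavior of bdrv_find_protocol() (illustrative, assuming the
 * usual protocol drivers are compiled in):
 *
 *     bdrv_find_protocol("nbd:localhost:10809", true) -> the "nbd" driver
 *     bdrv_find_protocol("/var/lib/img.qcow2", true)  -> &bdrv_file
 *     bdrv_find_protocol("nbd:localhost:10809", false) -> &bdrv_file
 *       (prefix parsing disabled, so the name is treated as a plain path)
 */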

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf contains the image's first @buf_size bytes.
 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 * but can be smaller if the image file is smaller)
 * @filename is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
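
/*
 * Summary of the cache-mode mapping implemented above (for reference):
 *
 *     mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *     writethrough        -                -                -
 *     writeback           -                x                -
 *     none/off            x                x                -
 *     directsync          x                -                -
 *     unsafe              -                x                x
 */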

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol as drv. This layer is
     * already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
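
/*
 * Example of the json: pseudo-protocol handled above (illustrative):
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "/vm/disk.qcow2"}}
 *
 * parse_json_filename() turns this into a flattened QDict equivalent to
 * driver=qcow2, file.driver=file, file.filename=/vm/disk.qcow2, which
 * bdrv_fill_options() below then merges with any explicit options.
 */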

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                       "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options
 * QDict. That QDict has to be flattened; therefore, if the BlockdevRef is a
 * QDict itself, all options starting with "${bdref_key}." are considered part
 * of the BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
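
/*
 * Example of a flattened BlockdevRef as consumed by bdrv_open_image()
 * (illustrative option values): with bdref_key = "file", an options QDict
 * containing
 *
 *     "file.driver"   = "file"
 *     "file.filename" = "/vm/disk.raw"
 *
 * yields an image_options sub-QDict of driver=file, filename=/vm/disk.raw,
 * while a plain "file" key could instead reference an existing node by name.
 */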

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}
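
/*
 * Resulting chain for snapshot=on (conceptual sketch): the guest-visible
 * node becomes a fresh qcow2 overlay in $TMPDIR whose backing file is the
 * originally requested image, e.g.
 *
 *     device -> /var/tmp/vl.XXXXXX (qcow2, temporary) -> original image
 *
 * so all guest writes land in the temporary overlay and are discarded
 * when it is deleted.
 */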

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored
 * there. If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device
 * which should be opened. If specified, neither options nor a filename may be
 * given, nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
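
/*
 * Example (hypothetical usage sketch): atomically dropping BDRV_O_RDWR on
 * two unrelated states in one transaction. Either both reopens commit or
 * both are rolled back by bdrv_reopen_multiple().
 *
 *     BlockReopenQueue *queue;
 *     queue = bdrv_reopen_queue(NULL, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
 *     ret = bdrv_reopen_multiple(queue, &local_err);  // also frees the queue
 */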

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the
 * BlockDriverState AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }
}

/* make a BlockDriverState anonymous by removing from bdrv_state and
 * graph_bdrv_state list.
 * Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
2073 */ 2074 if (bs_new->node_name[0] != '\0') { 2075 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list); 2076 } 2077 if (bs_old->node_name[0] != '\0') { 2078 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list); 2079 } 2080 2081 /* bs_new must be unattached and shouldn't have anything fancy enabled */ 2082 assert(!bs_new->blk); 2083 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps)); 2084 assert(bs_new->job == NULL); 2085 assert(bs_new->io_limits_enabled == false); 2086 assert(!throttle_have_timer(&bs_new->throttle_state)); 2087 2088 tmp = *bs_new; 2089 *bs_new = *bs_old; 2090 *bs_old = tmp; 2091 2092 /* there are some fields that should not be swapped, move them back */ 2093 bdrv_move_feature_fields(&tmp, bs_old); 2094 bdrv_move_feature_fields(bs_old, bs_new); 2095 bdrv_move_feature_fields(bs_new, &tmp); 2096 2097 /* bs_new must remain unattached */ 2098 assert(!bs_new->blk); 2099 2100 /* Check a few fields that should remain attached to the device */ 2101 assert(bs_new->job == NULL); 2102 assert(bs_new->io_limits_enabled == false); 2103 assert(!throttle_have_timer(&bs_new->throttle_state)); 2104 2105 /* insert the nodes back into the graph node list if needed */ 2106 if (bs_new->node_name[0] != '\0') { 2107 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list); 2108 } 2109 if (bs_old->node_name[0] != '\0') { 2110 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list); 2111 } 2112 2113 bdrv_rebind(bs_new); 2114 bdrv_rebind(bs_old); 2115 } 2116 2117 /* 2118 * Add new bs contents at the top of an image chain while the chain is 2119 * live, while keeping required fields on the top layer. 2120 * 2121 * This will modify the BlockDriverState fields, and swap contents 2122 * between bs_new and bs_top. Both bs_new and bs_top are modified. 2123 * 2124 * bs_new must not be attached to a BlockBackend. 2125 * 2126 * This function does not create any image files. 2127 */ 2128 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) 2129 { 2130 bdrv_swap(bs_new, bs_top); 2131 2132 /* The contents of 'tmp' will become bs_top, as we are 2133 * swapping bs_new and bs_top contents. */ 2134 bdrv_set_backing_hd(bs_top, bs_new); 2135 } 2136 2137 static void bdrv_delete(BlockDriverState *bs) 2138 { 2139 assert(!bs->job); 2140 assert(bdrv_op_blocker_is_empty(bs)); 2141 assert(!bs->refcnt); 2142 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 2143 2144 bdrv_close(bs); 2145 2146 /* remove from list, if necessary */ 2147 bdrv_make_anon(bs); 2148 2149 g_free(bs); 2150 } 2151 2152 /* 2153 * Run consistency checks on an image 2154 * 2155 * Returns 0 if the check could be completed (it doesn't mean that the image is 2156 * free of errors) or -errno when an internal error occurred. The results of the 2157 * check are stored in res. 
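 *
 * Illustrative use (a sketch, not code from this file; BDRV_FIX_LEAKS and
 * the BdrvCheckResult fields referenced are assumed from the block layer
 * headers):
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, BDRV_FIX_LEAKS);
 *     if (ret < 0 || res.check_errors) {
 *         ... the check itself could not be (fully) carried out ...
 *     } else if (res.corruptions || res.leaks) {
 *         ... image is damaged; res.leaks_fixed reports what was repaired ...
 *     }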
2158  */
2159 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2160 {
2161     if (bs->drv == NULL) {
2162         return -ENOMEDIUM;
2163     }
2164     if (bs->drv->bdrv_check == NULL) {
2165         return -ENOTSUP;
2166     }
2167
2168     memset(res, 0, sizeof(*res));
2169     return bs->drv->bdrv_check(bs, res, fix);
2170 }
2171
2172 #define COMMIT_BUF_SECTORS 2048
2173
2174 /* Commit the COW image contents into its backing image */
2175 int bdrv_commit(BlockDriverState *bs)
2176 {
2177     BlockDriver *drv = bs->drv;
2178     int64_t sector, total_sectors, length, backing_length;
2179     int n, ro, open_flags;
2180     int ret = 0;
2181     uint8_t *buf = NULL;
2182     char filename[PATH_MAX];
2183
2184     if (!drv)
2185         return -ENOMEDIUM;
2186
2187     if (!bs->backing_hd) {
2188         return -ENOTSUP;
2189     }
2190
2191     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2192         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2193         return -EBUSY;
2194     }
2195
2196     ro = bs->backing_hd->read_only;
2197     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2198     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2199     open_flags = bs->backing_hd->open_flags;
2200
2201     if (ro) {
2202         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2203             return -EACCES;
2204         }
2205     }
2206
2207     length = bdrv_getlength(bs);
2208     if (length < 0) {
2209         ret = length;
2210         goto ro_cleanup;
2211     }
2212
2213     backing_length = bdrv_getlength(bs->backing_hd);
2214     if (backing_length < 0) {
2215         ret = backing_length;
2216         goto ro_cleanup;
2217     }
2218
2219     /* If our top snapshot is larger than the backing file image,
2220      * grow the backing file image if possible. If not possible,
2221      * we must return an error */
2222     if (length > backing_length) {
2223         ret = bdrv_truncate(bs->backing_hd, length);
2224         if (ret < 0) {
2225             goto ro_cleanup;
2226         }
2227     }
2228
2229     total_sectors = length >> BDRV_SECTOR_BITS;
2230
2231     /* qemu_try_blockalign() for bs will choose an alignment that works for
2232      * bs->backing_hd as well, so no need to compare the alignment manually. */
2233     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2234     if (buf == NULL) {
2235         ret = -ENOMEM;
2236         goto ro_cleanup;
2237     }
2238
2239     for (sector = 0; sector < total_sectors; sector += n) {
2240         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2241         if (ret < 0) {
2242             goto ro_cleanup;
2243         }
2244         if (ret) {
2245             ret = bdrv_read(bs, sector, buf, n);
2246             if (ret < 0) {
2247                 goto ro_cleanup;
2248             }
2249
2250             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2251             if (ret < 0) {
2252                 goto ro_cleanup;
2253             }
2254         }
2255     }
2256
2257     if (drv->bdrv_make_empty) {
2258         ret = drv->bdrv_make_empty(bs);
2259         if (ret < 0) {
2260             goto ro_cleanup;
2261         }
2262         bdrv_flush(bs);
2263     }
2264
2265     /*
2266      * Make sure all data we wrote to the backing device is actually
2267      * stable on disk.
2268 */ 2269 if (bs->backing_hd) { 2270 bdrv_flush(bs->backing_hd); 2271 } 2272 2273 ret = 0; 2274 ro_cleanup: 2275 qemu_vfree(buf); 2276 2277 if (ro) { 2278 /* ignoring error return here */ 2279 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL); 2280 } 2281 2282 return ret; 2283 } 2284 2285 int bdrv_commit_all(void) 2286 { 2287 BlockDriverState *bs; 2288 2289 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 2290 AioContext *aio_context = bdrv_get_aio_context(bs); 2291 2292 aio_context_acquire(aio_context); 2293 if (bs->drv && bs->backing_hd) { 2294 int ret = bdrv_commit(bs); 2295 if (ret < 0) { 2296 aio_context_release(aio_context); 2297 return ret; 2298 } 2299 } 2300 aio_context_release(aio_context); 2301 } 2302 return 0; 2303 } 2304 2305 /** 2306 * Remove an active request from the tracked requests list 2307 * 2308 * This function should be called when a tracked request is completing. 2309 */ 2310 static void tracked_request_end(BdrvTrackedRequest *req) 2311 { 2312 if (req->serialising) { 2313 req->bs->serialising_in_flight--; 2314 } 2315 2316 QLIST_REMOVE(req, list); 2317 qemu_co_queue_restart_all(&req->wait_queue); 2318 } 2319 2320 /** 2321 * Add an active request to the tracked requests list 2322 */ 2323 static void tracked_request_begin(BdrvTrackedRequest *req, 2324 BlockDriverState *bs, 2325 int64_t offset, 2326 unsigned int bytes, bool is_write) 2327 { 2328 *req = (BdrvTrackedRequest){ 2329 .bs = bs, 2330 .offset = offset, 2331 .bytes = bytes, 2332 .is_write = is_write, 2333 .co = qemu_coroutine_self(), 2334 .serialising = false, 2335 .overlap_offset = offset, 2336 .overlap_bytes = bytes, 2337 }; 2338 2339 qemu_co_queue_init(&req->wait_queue); 2340 2341 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 2342 } 2343 2344 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 2345 { 2346 int64_t overlap_offset = req->offset & ~(align - 1); 2347 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 2348 - overlap_offset; 2349 2350 if (!req->serialising) { 2351 req->bs->serialising_in_flight++; 2352 req->serialising = true; 2353 } 2354 2355 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 2356 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 2357 } 2358 2359 /** 2360 * Round a region to cluster boundaries 2361 */ 2362 void bdrv_round_to_clusters(BlockDriverState *bs, 2363 int64_t sector_num, int nb_sectors, 2364 int64_t *cluster_sector_num, 2365 int *cluster_nb_sectors) 2366 { 2367 BlockDriverInfo bdi; 2368 2369 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 2370 *cluster_sector_num = sector_num; 2371 *cluster_nb_sectors = nb_sectors; 2372 } else { 2373 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 2374 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 2375 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 2376 nb_sectors, c); 2377 } 2378 } 2379 2380 static int bdrv_get_cluster_size(BlockDriverState *bs) 2381 { 2382 BlockDriverInfo bdi; 2383 int ret; 2384 2385 ret = bdrv_get_info(bs, &bdi); 2386 if (ret < 0 || bdi.cluster_size == 0) { 2387 return bs->request_alignment; 2388 } else { 2389 return bdi.cluster_size; 2390 } 2391 } 2392 2393 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 2394 int64_t offset, unsigned int bytes) 2395 { 2396 /* aaaa bbbb */ 2397 if (offset >= req->overlap_offset + req->overlap_bytes) { 2398 return false; 2399 } 2400 /* bbbb aaaa */ 2401 if (req->overlap_offset >= offset + bytes) { 2402 return false; 2403 } 2404 return true; 
2405 } 2406 2407 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 2408 { 2409 BlockDriverState *bs = self->bs; 2410 BdrvTrackedRequest *req; 2411 bool retry; 2412 bool waited = false; 2413 2414 if (!bs->serialising_in_flight) { 2415 return false; 2416 } 2417 2418 do { 2419 retry = false; 2420 QLIST_FOREACH(req, &bs->tracked_requests, list) { 2421 if (req == self || (!req->serialising && !self->serialising)) { 2422 continue; 2423 } 2424 if (tracked_request_overlaps(req, self->overlap_offset, 2425 self->overlap_bytes)) 2426 { 2427 /* Hitting this means there was a reentrant request, for 2428 * example, a block driver issuing nested requests. This must 2429 * never happen since it means deadlock. 2430 */ 2431 assert(qemu_coroutine_self() != req->co); 2432 2433 /* If the request is already (indirectly) waiting for us, or 2434 * will wait for us as soon as it wakes up, then just go on 2435 * (instead of producing a deadlock in the former case). */ 2436 if (!req->waiting_for) { 2437 self->waiting_for = req; 2438 qemu_co_queue_wait(&req->wait_queue); 2439 self->waiting_for = NULL; 2440 retry = true; 2441 waited = true; 2442 break; 2443 } 2444 } 2445 } 2446 } while (retry); 2447 2448 return waited; 2449 } 2450 2451 /* 2452 * Return values: 2453 * 0 - success 2454 * -EINVAL - backing format specified, but no file 2455 * -ENOSPC - can't update the backing file because no space is left in the 2456 * image file header 2457 * -ENOTSUP - format driver doesn't support changing the backing file 2458 */ 2459 int bdrv_change_backing_file(BlockDriverState *bs, 2460 const char *backing_file, const char *backing_fmt) 2461 { 2462 BlockDriver *drv = bs->drv; 2463 int ret; 2464 2465 /* Backing file format doesn't make sense without a backing file */ 2466 if (backing_fmt && !backing_file) { 2467 return -EINVAL; 2468 } 2469 2470 if (drv->bdrv_change_backing_file != NULL) { 2471 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt); 2472 } else { 2473 ret = -ENOTSUP; 2474 } 2475 2476 if (ret == 0) { 2477 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); 2478 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); 2479 } 2480 return ret; 2481 } 2482 2483 /* 2484 * Finds the image layer in the chain that has 'bs' as its backing file. 2485 * 2486 * active is the current topmost image. 2487 * 2488 * Returns NULL if bs is not found in active's image chain, 2489 * or if active == bs. 2490 * 2491 * Returns the bottommost base image if bs == NULL. 2492 */ 2493 BlockDriverState *bdrv_find_overlay(BlockDriverState *active, 2494 BlockDriverState *bs) 2495 { 2496 while (active && bs != active->backing_hd) { 2497 active = active->backing_hd; 2498 } 2499 2500 return active; 2501 } 2502 2503 /* Given a BDS, searches for the base layer. */ 2504 BlockDriverState *bdrv_find_base(BlockDriverState *bs) 2505 { 2506 return bdrv_find_overlay(bs, NULL); 2507 } 2508 2509 typedef struct BlkIntermediateStates { 2510 BlockDriverState *bs; 2511 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; 2512 } BlkIntermediateStates; 2513 2514 2515 /* 2516 * Drops images above 'base' up to and including 'top', and sets the image 2517 * above 'top' to have base as its backing file. 2518 * 2519 * Requires that the overlay to 'top' is opened r/w, so that the backing file 2520 * information in 'bs' can be properly updated. 
2521 * 2522 * E.g., this will convert the following chain: 2523 * bottom <- base <- intermediate <- top <- active 2524 * 2525 * to 2526 * 2527 * bottom <- base <- active 2528 * 2529 * It is allowed for bottom==base, in which case it converts: 2530 * 2531 * base <- intermediate <- top <- active 2532 * 2533 * to 2534 * 2535 * base <- active 2536 * 2537 * If backing_file_str is non-NULL, it will be used when modifying top's 2538 * overlay image metadata. 2539 * 2540 * Error conditions: 2541 * if active == top, that is considered an error 2542 * 2543 */ 2544 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, 2545 BlockDriverState *base, const char *backing_file_str) 2546 { 2547 BlockDriverState *intermediate; 2548 BlockDriverState *base_bs = NULL; 2549 BlockDriverState *new_top_bs = NULL; 2550 BlkIntermediateStates *intermediate_state, *next; 2551 int ret = -EIO; 2552 2553 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete; 2554 QSIMPLEQ_INIT(&states_to_delete); 2555 2556 if (!top->drv || !base->drv) { 2557 goto exit; 2558 } 2559 2560 new_top_bs = bdrv_find_overlay(active, top); 2561 2562 if (new_top_bs == NULL) { 2563 /* we could not find the image above 'top', this is an error */ 2564 goto exit; 2565 } 2566 2567 /* special case of new_top_bs->backing_hd already pointing to base - nothing 2568 * to do, no intermediate images */ 2569 if (new_top_bs->backing_hd == base) { 2570 ret = 0; 2571 goto exit; 2572 } 2573 2574 intermediate = top; 2575 2576 /* now we will go down through the list, and add each BDS we find 2577 * into our deletion queue, until we hit the 'base' 2578 */ 2579 while (intermediate) { 2580 intermediate_state = g_new0(BlkIntermediateStates, 1); 2581 intermediate_state->bs = intermediate; 2582 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry); 2583 2584 if (intermediate->backing_hd == base) { 2585 base_bs = intermediate->backing_hd; 2586 break; 2587 } 2588 intermediate = intermediate->backing_hd; 2589 } 2590 if (base_bs == NULL) { 2591 /* something went wrong, we did not end at the base. safely 2592 * unravel everything, and exit with error */ 2593 goto exit; 2594 } 2595 2596 /* success - we can delete the intermediate states, and link top->base */ 2597 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename; 2598 ret = bdrv_change_backing_file(new_top_bs, backing_file_str, 2599 base_bs->drv ? 
base_bs->drv->format_name : ""); 2600 if (ret) { 2601 goto exit; 2602 } 2603 bdrv_set_backing_hd(new_top_bs, base_bs); 2604 2605 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2606 /* so that bdrv_close() does not recursively close the chain */ 2607 bdrv_set_backing_hd(intermediate_state->bs, NULL); 2608 bdrv_unref(intermediate_state->bs); 2609 } 2610 ret = 0; 2611 2612 exit: 2613 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2614 g_free(intermediate_state); 2615 } 2616 return ret; 2617 } 2618 2619 2620 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 2621 size_t size) 2622 { 2623 int64_t len; 2624 2625 if (size > INT_MAX) { 2626 return -EIO; 2627 } 2628 2629 if (!bdrv_is_inserted(bs)) 2630 return -ENOMEDIUM; 2631 2632 if (bs->growable) 2633 return 0; 2634 2635 len = bdrv_getlength(bs); 2636 2637 if (offset < 0) 2638 return -EIO; 2639 2640 if ((offset > len) || (len - offset < size)) 2641 return -EIO; 2642 2643 return 0; 2644 } 2645 2646 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 2647 int nb_sectors) 2648 { 2649 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { 2650 return -EIO; 2651 } 2652 2653 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 2654 nb_sectors * BDRV_SECTOR_SIZE); 2655 } 2656 2657 typedef struct RwCo { 2658 BlockDriverState *bs; 2659 int64_t offset; 2660 QEMUIOVector *qiov; 2661 bool is_write; 2662 int ret; 2663 BdrvRequestFlags flags; 2664 } RwCo; 2665 2666 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 2667 { 2668 RwCo *rwco = opaque; 2669 2670 if (!rwco->is_write) { 2671 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 2672 rwco->qiov->size, rwco->qiov, 2673 rwco->flags); 2674 } else { 2675 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 2676 rwco->qiov->size, rwco->qiov, 2677 rwco->flags); 2678 } 2679 } 2680 2681 /* 2682 * Process a vectored synchronous request using coroutines 2683 */ 2684 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 2685 QEMUIOVector *qiov, bool is_write, 2686 BdrvRequestFlags flags) 2687 { 2688 Coroutine *co; 2689 RwCo rwco = { 2690 .bs = bs, 2691 .offset = offset, 2692 .qiov = qiov, 2693 .is_write = is_write, 2694 .ret = NOT_DONE, 2695 .flags = flags, 2696 }; 2697 2698 /** 2699 * In sync call context, when the vcpu is blocked, this throttling timer 2700 * will not fire; so the I/O throttling function has to be disabled here 2701 * if it has been enabled. 
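     *
     * (Concrete failure mode being avoided, for illustration: the throttle
     * timer runs on QEMU_CLOCK_VIRTUAL, which does not advance while the
     * guest is stopped, so a synchronous request issued in that state would
     * spin in aio_poll() below forever.)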
2702 */ 2703 if (bs->io_limits_enabled) { 2704 fprintf(stderr, "Disabling I/O throttling on '%s' due " 2705 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 2706 bdrv_io_limits_disable(bs); 2707 } 2708 2709 if (qemu_in_coroutine()) { 2710 /* Fast-path if already in coroutine context */ 2711 bdrv_rw_co_entry(&rwco); 2712 } else { 2713 AioContext *aio_context = bdrv_get_aio_context(bs); 2714 2715 co = qemu_coroutine_create(bdrv_rw_co_entry); 2716 qemu_coroutine_enter(co, &rwco); 2717 while (rwco.ret == NOT_DONE) { 2718 aio_poll(aio_context, true); 2719 } 2720 } 2721 return rwco.ret; 2722 } 2723 2724 /* 2725 * Process a synchronous request using coroutines 2726 */ 2727 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 2728 int nb_sectors, bool is_write, BdrvRequestFlags flags) 2729 { 2730 QEMUIOVector qiov; 2731 struct iovec iov = { 2732 .iov_base = (void *)buf, 2733 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 2734 }; 2735 2736 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { 2737 return -EINVAL; 2738 } 2739 2740 qemu_iovec_init_external(&qiov, &iov, 1); 2741 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 2742 &qiov, is_write, flags); 2743 } 2744 2745 /* return < 0 if error. See bdrv_write() for the return codes */ 2746 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 2747 uint8_t *buf, int nb_sectors) 2748 { 2749 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 2750 } 2751 2752 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 2753 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 2754 uint8_t *buf, int nb_sectors) 2755 { 2756 bool enabled; 2757 int ret; 2758 2759 enabled = bs->io_limits_enabled; 2760 bs->io_limits_enabled = false; 2761 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 2762 bs->io_limits_enabled = enabled; 2763 return ret; 2764 } 2765 2766 /* Return < 0 if error. Important errors are: 2767 -EIO generic I/O error (may happen for all errors) 2768 -ENOMEDIUM No media inserted. 2769 -EINVAL Invalid sector number or nb_sectors 2770 -EACCES Trying to write a read-only device 2771 */ 2772 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 2773 const uint8_t *buf, int nb_sectors) 2774 { 2775 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 2776 } 2777 2778 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 2779 int nb_sectors, BdrvRequestFlags flags) 2780 { 2781 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 2782 BDRV_REQ_ZERO_WRITE | flags); 2783 } 2784 2785 /* 2786 * Completely zero out a block device with the help of bdrv_write_zeroes. 2787 * The operation is sped up by checking the block status and only writing 2788 * zeroes to the device if they currently do not return zeroes. Optional 2789 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 2790 * 2791 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
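 *
 * Illustrative call (a sketch, not code from this file):
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         error_report("zeroing failed: %s", strerror(-ret));
 *     }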
2792 */ 2793 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 2794 { 2795 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 2796 int n; 2797 2798 target_sectors = bdrv_nb_sectors(bs); 2799 if (target_sectors < 0) { 2800 return target_sectors; 2801 } 2802 2803 for (;;) { 2804 nb_sectors = target_sectors - sector_num; 2805 if (nb_sectors <= 0) { 2806 return 0; 2807 } 2808 if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { 2809 nb_sectors = INT_MAX / BDRV_SECTOR_SIZE; 2810 } 2811 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 2812 if (ret < 0) { 2813 error_report("error getting block status at sector %" PRId64 ": %s", 2814 sector_num, strerror(-ret)); 2815 return ret; 2816 } 2817 if (ret & BDRV_BLOCK_ZERO) { 2818 sector_num += n; 2819 continue; 2820 } 2821 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 2822 if (ret < 0) { 2823 error_report("error writing zeroes at sector %" PRId64 ": %s", 2824 sector_num, strerror(-ret)); 2825 return ret; 2826 } 2827 sector_num += n; 2828 } 2829 } 2830 2831 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 2832 { 2833 QEMUIOVector qiov; 2834 struct iovec iov = { 2835 .iov_base = (void *)buf, 2836 .iov_len = bytes, 2837 }; 2838 int ret; 2839 2840 if (bytes < 0) { 2841 return -EINVAL; 2842 } 2843 2844 qemu_iovec_init_external(&qiov, &iov, 1); 2845 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 2846 if (ret < 0) { 2847 return ret; 2848 } 2849 2850 return bytes; 2851 } 2852 2853 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 2854 { 2855 int ret; 2856 2857 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 2858 if (ret < 0) { 2859 return ret; 2860 } 2861 2862 return qiov->size; 2863 } 2864 2865 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 2866 const void *buf, int bytes) 2867 { 2868 QEMUIOVector qiov; 2869 struct iovec iov = { 2870 .iov_base = (void *) buf, 2871 .iov_len = bytes, 2872 }; 2873 2874 if (bytes < 0) { 2875 return -EINVAL; 2876 } 2877 2878 qemu_iovec_init_external(&qiov, &iov, 1); 2879 return bdrv_pwritev(bs, offset, &qiov); 2880 } 2881 2882 /* 2883 * Writes to the file and ensures that no writes are reordered across this 2884 * request (acts as a barrier) 2885 * 2886 * Returns 0 on success, -errno in error cases. 2887 */ 2888 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 2889 const void *buf, int count) 2890 { 2891 int ret; 2892 2893 ret = bdrv_pwrite(bs, offset, buf, count); 2894 if (ret < 0) { 2895 return ret; 2896 } 2897 2898 /* No flush needed for cache modes that already do it */ 2899 if (bs->enable_write_cache) { 2900 bdrv_flush(bs); 2901 } 2902 2903 return 0; 2904 } 2905 2906 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 2907 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 2908 { 2909 /* Perform I/O through a temporary buffer so that users who scribble over 2910 * their read buffer while the operation is in progress do not end up 2911 * modifying the image file. This is critical for zero-copy guest I/O 2912 * where anything might happen inside guest memory. 2913 */ 2914 void *bounce_buffer; 2915 2916 BlockDriver *drv = bs->drv; 2917 struct iovec iov; 2918 QEMUIOVector bounce_qiov; 2919 int64_t cluster_sector_num; 2920 int cluster_nb_sectors; 2921 size_t skip_bytes; 2922 int ret; 2923 2924 /* Cover entire cluster so no additional backing file I/O is required when 2925 * allocating cluster in the image file. 
2926 */ 2927 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 2928 &cluster_sector_num, &cluster_nb_sectors); 2929 2930 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 2931 cluster_sector_num, cluster_nb_sectors); 2932 2933 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 2934 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 2935 if (bounce_buffer == NULL) { 2936 ret = -ENOMEM; 2937 goto err; 2938 } 2939 2940 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 2941 2942 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 2943 &bounce_qiov); 2944 if (ret < 0) { 2945 goto err; 2946 } 2947 2948 if (drv->bdrv_co_write_zeroes && 2949 buffer_is_zero(bounce_buffer, iov.iov_len)) { 2950 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 2951 cluster_nb_sectors, 0); 2952 } else { 2953 /* This does not change the data on the disk, it is not necessary 2954 * to flush even in cache=writethrough mode. 2955 */ 2956 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 2957 &bounce_qiov); 2958 } 2959 2960 if (ret < 0) { 2961 /* It might be okay to ignore write errors for guest requests. If this 2962 * is a deliberate copy-on-read then we don't want to ignore the error. 2963 * Simply report it in all cases. 2964 */ 2965 goto err; 2966 } 2967 2968 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 2969 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 2970 nb_sectors * BDRV_SECTOR_SIZE); 2971 2972 err: 2973 qemu_vfree(bounce_buffer); 2974 return ret; 2975 } 2976 2977 /* 2978 * Forwards an already correctly aligned request to the BlockDriver. This 2979 * handles copy on read and zeroing after EOF; any other features must be 2980 * implemented by the caller. 2981 */ 2982 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 2983 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 2984 int64_t align, QEMUIOVector *qiov, int flags) 2985 { 2986 BlockDriver *drv = bs->drv; 2987 int ret; 2988 2989 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 2990 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 2991 2992 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 2993 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 2994 assert(!qiov || bytes == qiov->size); 2995 2996 /* Handle Copy on Read and associated serialisation */ 2997 if (flags & BDRV_REQ_COPY_ON_READ) { 2998 /* If we touch the same cluster it counts as an overlap. This 2999 * guarantees that allocating writes will be serialized and not race 3000 * with each other for the same cluster. For example, in copy-on-read 3001 * it ensures that the CoR read and write operations are atomic and 3002 * guest writes cannot interleave between them. 
*/ 3003 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 3004 } 3005 3006 wait_serialising_requests(req); 3007 3008 if (flags & BDRV_REQ_COPY_ON_READ) { 3009 int pnum; 3010 3011 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 3012 if (ret < 0) { 3013 goto out; 3014 } 3015 3016 if (!ret || pnum != nb_sectors) { 3017 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 3018 goto out; 3019 } 3020 } 3021 3022 /* Forward the request to the BlockDriver */ 3023 if (!(bs->zero_beyond_eof && bs->growable)) { 3024 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 3025 } else { 3026 /* Read zeros after EOF of growable BDSes */ 3027 int64_t total_sectors, max_nb_sectors; 3028 3029 total_sectors = bdrv_nb_sectors(bs); 3030 if (total_sectors < 0) { 3031 ret = total_sectors; 3032 goto out; 3033 } 3034 3035 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 3036 align >> BDRV_SECTOR_BITS); 3037 if (max_nb_sectors > 0) { 3038 QEMUIOVector local_qiov; 3039 size_t local_sectors; 3040 3041 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS); 3042 local_sectors = MIN(max_nb_sectors, nb_sectors); 3043 3044 qemu_iovec_init(&local_qiov, qiov->niov); 3045 qemu_iovec_concat(&local_qiov, qiov, 0, 3046 local_sectors * BDRV_SECTOR_SIZE); 3047 3048 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors, 3049 &local_qiov); 3050 3051 qemu_iovec_destroy(&local_qiov); 3052 } else { 3053 ret = 0; 3054 } 3055 3056 /* Reading beyond end of file is supposed to produce zeroes */ 3057 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 3058 uint64_t offset = MAX(0, total_sectors - sector_num); 3059 uint64_t bytes = (sector_num + nb_sectors - offset) * 3060 BDRV_SECTOR_SIZE; 3061 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 3062 } 3063 } 3064 3065 out: 3066 return ret; 3067 } 3068 3069 /* 3070 * Handle a read request in coroutine context 3071 */ 3072 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 3073 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 3074 BdrvRequestFlags flags) 3075 { 3076 BlockDriver *drv = bs->drv; 3077 BdrvTrackedRequest req; 3078 3079 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 3080 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 3081 uint8_t *head_buf = NULL; 3082 uint8_t *tail_buf = NULL; 3083 QEMUIOVector local_qiov; 3084 bool use_local_qiov = false; 3085 int ret; 3086 3087 if (!drv) { 3088 return -ENOMEDIUM; 3089 } 3090 if (bdrv_check_byte_request(bs, offset, bytes)) { 3091 return -EIO; 3092 } 3093 3094 if (bs->copy_on_read) { 3095 flags |= BDRV_REQ_COPY_ON_READ; 3096 } 3097 3098 /* throttling disk I/O */ 3099 if (bs->io_limits_enabled) { 3100 bdrv_io_limits_intercept(bs, bytes, false); 3101 } 3102 3103 /* Align read if necessary by padding qiov */ 3104 if (offset & (align - 1)) { 3105 head_buf = qemu_blockalign(bs, align); 3106 qemu_iovec_init(&local_qiov, qiov->niov + 2); 3107 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 3108 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3109 use_local_qiov = true; 3110 3111 bytes += offset & (align - 1); 3112 offset = offset & ~(align - 1); 3113 } 3114 3115 if ((offset + bytes) & (align - 1)) { 3116 if (!use_local_qiov) { 3117 qemu_iovec_init(&local_qiov, qiov->niov + 1); 3118 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3119 use_local_qiov = true; 3120 } 3121 tail_buf = qemu_blockalign(bs, align); 3122 qemu_iovec_add(&local_qiov, tail_buf, 3123 align - ((offset + bytes) & (align - 
1))); 3124 3125 bytes = ROUND_UP(bytes, align); 3126 } 3127 3128 tracked_request_begin(&req, bs, offset, bytes, false); 3129 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 3130 use_local_qiov ? &local_qiov : qiov, 3131 flags); 3132 tracked_request_end(&req); 3133 3134 if (use_local_qiov) { 3135 qemu_iovec_destroy(&local_qiov); 3136 qemu_vfree(head_buf); 3137 qemu_vfree(tail_buf); 3138 } 3139 3140 return ret; 3141 } 3142 3143 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 3144 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 3145 BdrvRequestFlags flags) 3146 { 3147 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { 3148 return -EINVAL; 3149 } 3150 3151 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 3152 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 3153 } 3154 3155 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 3156 int nb_sectors, QEMUIOVector *qiov) 3157 { 3158 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 3159 3160 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 3161 } 3162 3163 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 3164 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 3165 { 3166 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 3167 3168 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 3169 BDRV_REQ_COPY_ON_READ); 3170 } 3171 3172 /* if no limit is specified in the BlockLimits use a default 3173 * of 32768 512-byte sectors (16 MiB) per request. 3174 */ 3175 #define MAX_WRITE_ZEROES_DEFAULT 32768 3176 3177 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 3178 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 3179 { 3180 BlockDriver *drv = bs->drv; 3181 QEMUIOVector qiov; 3182 struct iovec iov = {0}; 3183 int ret = 0; 3184 3185 int max_write_zeroes = bs->bl.max_write_zeroes ? 3186 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; 3187 3188 while (nb_sectors > 0 && !ret) { 3189 int num = nb_sectors; 3190 3191 /* Align request. Block drivers can expect the "bulk" of the request 3192 * to be aligned. 3193 */ 3194 if (bs->bl.write_zeroes_alignment 3195 && num > bs->bl.write_zeroes_alignment) { 3196 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 3197 /* Make a small request up to the first aligned sector. */ 3198 num = bs->bl.write_zeroes_alignment; 3199 num -= sector_num % bs->bl.write_zeroes_alignment; 3200 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 3201 /* Shorten the request to the last aligned sector. num cannot 3202 * underflow because num > bs->bl.write_zeroes_alignment. 
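             *
             * Worked example of the whole alignment dance (alignment of 8
             * sectors assumed): zeroing 16 sectors starting at sector 5 is
             * issued as three requests covering [5, 8), [8, 16) and
             * [16, 21), so only the short head and tail are unaligned.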
3203              */
3204             num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3205         }
3206     }
3207
3208     /* limit request size */
3209     if (num > max_write_zeroes) {
3210         num = max_write_zeroes;
3211     }
3212
3213     ret = -ENOTSUP;
3214     /* First try the efficient write zeroes operation */
3215     if (drv->bdrv_co_write_zeroes) {
3216         ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3217     }
3218
3219     if (ret == -ENOTSUP) {
3220         /* Fall back to bounce buffer if write zeroes is unsupported */
3221         iov.iov_len = num * BDRV_SECTOR_SIZE;
3222         if (iov.iov_base == NULL) {
3223             iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3224             if (iov.iov_base == NULL) {
3225                 ret = -ENOMEM;
3226                 goto fail;
3227             }
3228             memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3229         }
3230         qemu_iovec_init_external(&qiov, &iov, 1);
3231
3232         ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3233
3234         /* Keep bounce buffer around if it is big enough for all
3235          * future requests.
3236          */
3237         if (num < max_write_zeroes) {
3238             qemu_vfree(iov.iov_base);
3239             iov.iov_base = NULL;
3240         }
3241     }
3242
3243     sector_num += num;
3244     nb_sectors -= num;
3245 }
3246
3247 fail:
3248     qemu_vfree(iov.iov_base);
3249     return ret;
3250 }
3251
3252 /*
3253  * Forwards an already correctly aligned write request to the BlockDriver.
3254  */
3255 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3256     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3257     QEMUIOVector *qiov, int flags)
3258 {
3259     BlockDriver *drv = bs->drv;
3260     bool waited;
3261     int ret;
3262
3263     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3264     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3265
3266     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3267     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3268     assert(!qiov || bytes == qiov->size);
3269
3270     waited = wait_serialising_requests(req);
3271     assert(!waited || !req->serialising);
3272     assert(req->overlap_offset <= offset);
3273     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3274
3275     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3276
3277     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3278         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3279         qemu_iovec_is_zero(qiov)) {
3280         flags |= BDRV_REQ_ZERO_WRITE;
3281         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3282             flags |= BDRV_REQ_MAY_UNMAP;
3283         }
3284     }
3285
3286     if (ret < 0) {
3287         /* Do nothing, write notifier decided to fail this request */
3288     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3289         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3290         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3291     } else {
3292         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3293         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3294     }
3295     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3296
3297     if (ret == 0 && !bs->enable_write_cache) {
3298         ret = bdrv_co_flush(bs);
3299     }
3300
3301     bdrv_set_dirty(bs, sector_num, nb_sectors);
3302
3303     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3304
3305     if (bs->growable && ret >= 0) {
3306         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3307     }
3308
3309     return ret;
3310 }
3311
3312 /*
3313  * Handle a write request in coroutine context
3314  */
3315 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3316     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3317     BdrvRequestFlags flags)
3318 {
3319     BdrvTrackedRequest req;
3320     /* TODO Lift
BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 3321 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 3322 uint8_t *head_buf = NULL; 3323 uint8_t *tail_buf = NULL; 3324 QEMUIOVector local_qiov; 3325 bool use_local_qiov = false; 3326 int ret; 3327 3328 if (!bs->drv) { 3329 return -ENOMEDIUM; 3330 } 3331 if (bs->read_only) { 3332 return -EACCES; 3333 } 3334 if (bdrv_check_byte_request(bs, offset, bytes)) { 3335 return -EIO; 3336 } 3337 3338 /* throttling disk I/O */ 3339 if (bs->io_limits_enabled) { 3340 bdrv_io_limits_intercept(bs, bytes, true); 3341 } 3342 3343 /* 3344 * Align write if necessary by performing a read-modify-write cycle. 3345 * Pad qiov with the read parts and be sure to have a tracked request not 3346 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 3347 */ 3348 tracked_request_begin(&req, bs, offset, bytes, true); 3349 3350 if (offset & (align - 1)) { 3351 QEMUIOVector head_qiov; 3352 struct iovec head_iov; 3353 3354 mark_request_serialising(&req, align); 3355 wait_serialising_requests(&req); 3356 3357 head_buf = qemu_blockalign(bs, align); 3358 head_iov = (struct iovec) { 3359 .iov_base = head_buf, 3360 .iov_len = align, 3361 }; 3362 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 3363 3364 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 3365 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 3366 align, &head_qiov, 0); 3367 if (ret < 0) { 3368 goto fail; 3369 } 3370 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 3371 3372 qemu_iovec_init(&local_qiov, qiov->niov + 2); 3373 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 3374 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3375 use_local_qiov = true; 3376 3377 bytes += offset & (align - 1); 3378 offset = offset & ~(align - 1); 3379 } 3380 3381 if ((offset + bytes) & (align - 1)) { 3382 QEMUIOVector tail_qiov; 3383 struct iovec tail_iov; 3384 size_t tail_bytes; 3385 bool waited; 3386 3387 mark_request_serialising(&req, align); 3388 waited = wait_serialising_requests(&req); 3389 assert(!waited || !use_local_qiov); 3390 3391 tail_buf = qemu_blockalign(bs, align); 3392 tail_iov = (struct iovec) { 3393 .iov_base = tail_buf, 3394 .iov_len = align, 3395 }; 3396 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 3397 3398 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 3399 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 3400 align, &tail_qiov, 0); 3401 if (ret < 0) { 3402 goto fail; 3403 } 3404 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 3405 3406 if (!use_local_qiov) { 3407 qemu_iovec_init(&local_qiov, qiov->niov + 1); 3408 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3409 use_local_qiov = true; 3410 } 3411 3412 tail_bytes = (offset + bytes) & (align - 1); 3413 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 3414 3415 bytes = ROUND_UP(bytes, align); 3416 } 3417 3418 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 3419 use_local_qiov ? 
&local_qiov : qiov,
3420                                flags);
3421
3422 fail:
3423     tracked_request_end(&req);
3424
3425     if (use_local_qiov) {
3426         qemu_iovec_destroy(&local_qiov);
3427     }
3428     qemu_vfree(head_buf);
3429     qemu_vfree(tail_buf);
3430
3431     return ret;
3432 }
3433
3434 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3435     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3436     BdrvRequestFlags flags)
3437 {
3438     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3439         return -EINVAL;
3440     }
3441
3442     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3443                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3444 }
3445
3446 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3447     int nb_sectors, QEMUIOVector *qiov)
3448 {
3449     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3450
3451     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3452 }
3453
3454 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3455     int64_t sector_num, int nb_sectors,
3456     BdrvRequestFlags flags)
3457 {
3458     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3459
3460     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3461         flags &= ~BDRV_REQ_MAY_UNMAP;
3462     }
3463
3464     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3465                              BDRV_REQ_ZERO_WRITE | flags);
3466 }
3467
3468 /**
3469  * Truncate file to 'offset' bytes (needed only for file protocols)
3470  */
3471 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3472 {
3473     BlockDriver *drv = bs->drv;
3474     int ret;
3475     if (!drv)
3476         return -ENOMEDIUM;
3477     if (!drv->bdrv_truncate)
3478         return -ENOTSUP;
3479     if (bs->read_only)
3480         return -EACCES;
3481
3482     ret = drv->bdrv_truncate(bs, offset);
3483     if (ret == 0) {
3484         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3485         if (bs->blk) {
3486             blk_dev_resize_cb(bs->blk);
3487         }
3488     }
3489     return ret;
3490 }
3491
3492 /**
3493  * Length of an allocated file in bytes. Sparse files are counted by actual
3494  * allocated space. Return < 0 if error or unknown.
3495  */
3496 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3497 {
3498     BlockDriver *drv = bs->drv;
3499     if (!drv) {
3500         return -ENOMEDIUM;
3501     }
3502     if (drv->bdrv_get_allocated_file_size) {
3503         return drv->bdrv_get_allocated_file_size(bs);
3504     }
3505     if (bs->file) {
3506         return bdrv_get_allocated_file_size(bs->file);
3507     }
3508     return -ENOTSUP;
3509 }
3510
3511 /**
3512  * Return number of sectors on success, -errno on error.
3513  */
3514 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3515 {
3516     BlockDriver *drv = bs->drv;
3517
3518     if (!drv)
3519         return -ENOMEDIUM;
3520
3521     if (drv->has_variable_length) {
3522         int ret = refresh_total_sectors(bs, bs->total_sectors);
3523         if (ret < 0) {
3524             return ret;
3525         }
3526     }
3527     return bs->total_sectors;
3528 }
3529
3530 /**
3531  * Return length in bytes on success, -errno on error.
3532  * The length is always a multiple of BDRV_SECTOR_SIZE.
3533  */
3534 int64_t bdrv_getlength(BlockDriverState *bs)
3535 {
3536     int64_t ret = bdrv_nb_sectors(bs);
3537
3538     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3539 }
3540
3541 /* return 0 as number of sectors if no device present or error */
3542 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3543 {
3544     int64_t nb_sectors = bdrv_nb_sectors(bs);
3545
3546     *nb_sectors_ptr = nb_sectors < 0 ?
0 : nb_sectors; 3547 } 3548 3549 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, 3550 BlockdevOnError on_write_error) 3551 { 3552 bs->on_read_error = on_read_error; 3553 bs->on_write_error = on_write_error; 3554 } 3555 3556 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) 3557 { 3558 return is_read ? bs->on_read_error : bs->on_write_error; 3559 } 3560 3561 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) 3562 { 3563 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; 3564 3565 switch (on_err) { 3566 case BLOCKDEV_ON_ERROR_ENOSPC: 3567 return (error == ENOSPC) ? 3568 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; 3569 case BLOCKDEV_ON_ERROR_STOP: 3570 return BLOCK_ERROR_ACTION_STOP; 3571 case BLOCKDEV_ON_ERROR_REPORT: 3572 return BLOCK_ERROR_ACTION_REPORT; 3573 case BLOCKDEV_ON_ERROR_IGNORE: 3574 return BLOCK_ERROR_ACTION_IGNORE; 3575 default: 3576 abort(); 3577 } 3578 } 3579 3580 static void send_qmp_error_event(BlockDriverState *bs, 3581 BlockErrorAction action, 3582 bool is_read, int error) 3583 { 3584 IoOperationType optype; 3585 3586 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; 3587 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, 3588 bdrv_iostatus_is_enabled(bs), 3589 error == ENOSPC, strerror(error), 3590 &error_abort); 3591 } 3592 3593 /* This is done by device models because, while the block layer knows 3594 * about the error, it does not know whether an operation comes from 3595 * the device or the block layer (from a job, for example). 3596 */ 3597 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, 3598 bool is_read, int error) 3599 { 3600 assert(error >= 0); 3601 3602 if (action == BLOCK_ERROR_ACTION_STOP) { 3603 /* First set the iostatus, so that "info block" returns an iostatus 3604 * that matches the events raised so far (an additional error iostatus 3605 * is fine, but not a lost one). 3606 */ 3607 bdrv_iostatus_set_err(bs, error); 3608 3609 /* Then raise the request to stop the VM and the event. 3610 * qemu_system_vmstop_request_prepare has two effects. First, 3611 * it ensures that the STOP event always comes after the 3612 * BLOCK_IO_ERROR event. Second, it ensures that even if management 3613 * can observe the STOP event and do a "cont" before the STOP 3614 * event is issued, the VM will not stop. In this case, vm_start() 3615 * also ensures that the STOP/RESUME pair of events is emitted. 
3616 */ 3617 qemu_system_vmstop_request_prepare(); 3618 send_qmp_error_event(bs, action, is_read, error); 3619 qemu_system_vmstop_request(RUN_STATE_IO_ERROR); 3620 } else { 3621 send_qmp_error_event(bs, action, is_read, error); 3622 } 3623 } 3624 3625 int bdrv_is_read_only(BlockDriverState *bs) 3626 { 3627 return bs->read_only; 3628 } 3629 3630 int bdrv_is_sg(BlockDriverState *bs) 3631 { 3632 return bs->sg; 3633 } 3634 3635 int bdrv_enable_write_cache(BlockDriverState *bs) 3636 { 3637 return bs->enable_write_cache; 3638 } 3639 3640 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) 3641 { 3642 bs->enable_write_cache = wce; 3643 3644 /* so a reopen() will preserve wce */ 3645 if (wce) { 3646 bs->open_flags |= BDRV_O_CACHE_WB; 3647 } else { 3648 bs->open_flags &= ~BDRV_O_CACHE_WB; 3649 } 3650 } 3651 3652 int bdrv_is_encrypted(BlockDriverState *bs) 3653 { 3654 if (bs->backing_hd && bs->backing_hd->encrypted) 3655 return 1; 3656 return bs->encrypted; 3657 } 3658 3659 int bdrv_key_required(BlockDriverState *bs) 3660 { 3661 BlockDriverState *backing_hd = bs->backing_hd; 3662 3663 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) 3664 return 1; 3665 return (bs->encrypted && !bs->valid_key); 3666 } 3667 3668 int bdrv_set_key(BlockDriverState *bs, const char *key) 3669 { 3670 int ret; 3671 if (bs->backing_hd && bs->backing_hd->encrypted) { 3672 ret = bdrv_set_key(bs->backing_hd, key); 3673 if (ret < 0) 3674 return ret; 3675 if (!bs->encrypted) 3676 return 0; 3677 } 3678 if (!bs->encrypted) { 3679 return -EINVAL; 3680 } else if (!bs->drv || !bs->drv->bdrv_set_key) { 3681 return -ENOMEDIUM; 3682 } 3683 ret = bs->drv->bdrv_set_key(bs, key); 3684 if (ret < 0) { 3685 bs->valid_key = 0; 3686 } else if (!bs->valid_key) { 3687 bs->valid_key = 1; 3688 if (bs->blk) { 3689 /* call the change callback now, we skipped it on open */ 3690 blk_dev_change_media_cb(bs->blk, true); 3691 } 3692 } 3693 return ret; 3694 } 3695 3696 const char *bdrv_get_format_name(BlockDriverState *bs) 3697 { 3698 return bs->drv ? bs->drv->format_name : NULL; 3699 } 3700 3701 static int qsort_strcmp(const void *a, const void *b) 3702 { 3703 return strcmp(a, b); 3704 } 3705 3706 void bdrv_iterate_format(void (*it)(void *opaque, const char *name), 3707 void *opaque) 3708 { 3709 BlockDriver *drv; 3710 int count = 0; 3711 int i; 3712 const char **formats = NULL; 3713 3714 QLIST_FOREACH(drv, &bdrv_drivers, list) { 3715 if (drv->format_name) { 3716 bool found = false; 3717 int i = count; 3718 while (formats && i && !found) { 3719 found = !strcmp(formats[--i], drv->format_name); 3720 } 3721 3722 if (!found) { 3723 formats = g_renew(const char *, formats, count + 1); 3724 formats[count++] = drv->format_name; 3725 } 3726 } 3727 } 3728 3729 qsort(formats, count, sizeof(formats[0]), qsort_strcmp); 3730 3731 for (i = 0; i < count; i++) { 3732 it(opaque, formats[i]); 3733 } 3734 3735 g_free(formats); 3736 } 3737 3738 /* This function is to find block backend bs */ 3739 /* TODO convert callers to blk_by_name(), then remove */ 3740 BlockDriverState *bdrv_find(const char *name) 3741 { 3742 BlockBackend *blk = blk_by_name(name); 3743 3744 return blk ? 
blk_bs(blk) : NULL; 3745 } 3746 3747 /* This function is to find a node in the bs graph */ 3748 BlockDriverState *bdrv_find_node(const char *node_name) 3749 { 3750 BlockDriverState *bs; 3751 3752 assert(node_name); 3753 3754 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { 3755 if (!strcmp(node_name, bs->node_name)) { 3756 return bs; 3757 } 3758 } 3759 return NULL; 3760 } 3761 3762 /* Put this QMP function here so it can access the static graph_bdrv_states. */ 3763 BlockDeviceInfoList *bdrv_named_nodes_list(void) 3764 { 3765 BlockDeviceInfoList *list, *entry; 3766 BlockDriverState *bs; 3767 3768 list = NULL; 3769 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { 3770 entry = g_malloc0(sizeof(*entry)); 3771 entry->value = bdrv_block_device_info(bs); 3772 entry->next = list; 3773 list = entry; 3774 } 3775 3776 return list; 3777 } 3778 3779 BlockDriverState *bdrv_lookup_bs(const char *device, 3780 const char *node_name, 3781 Error **errp) 3782 { 3783 BlockBackend *blk; 3784 BlockDriverState *bs; 3785 3786 if (device) { 3787 blk = blk_by_name(device); 3788 3789 if (blk) { 3790 return blk_bs(blk); 3791 } 3792 } 3793 3794 if (node_name) { 3795 bs = bdrv_find_node(node_name); 3796 3797 if (bs) { 3798 return bs; 3799 } 3800 } 3801 3802 error_setg(errp, "Cannot find device=%s nor node_name=%s", 3803 device ? device : "", 3804 node_name ? node_name : ""); 3805 return NULL; 3806 } 3807 3808 /* If 'base' is in the same chain as 'top', return true. Otherwise, 3809 * return false. If either argument is NULL, return false. */ 3810 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) 3811 { 3812 while (top && top != base) { 3813 top = top->backing_hd; 3814 } 3815 3816 return top != NULL; 3817 } 3818 3819 BlockDriverState *bdrv_next_node(BlockDriverState *bs) 3820 { 3821 if (!bs) { 3822 return QTAILQ_FIRST(&graph_bdrv_states); 3823 } 3824 return QTAILQ_NEXT(bs, node_list); 3825 } 3826 3827 BlockDriverState *bdrv_next(BlockDriverState *bs) 3828 { 3829 if (!bs) { 3830 return QTAILQ_FIRST(&bdrv_states); 3831 } 3832 return QTAILQ_NEXT(bs, device_list); 3833 } 3834 3835 const char *bdrv_get_node_name(const BlockDriverState *bs) 3836 { 3837 return bs->node_name; 3838 } 3839 3840 /* TODO check what callers really want: bs->node_name or blk_name() */ 3841 const char *bdrv_get_device_name(const BlockDriverState *bs) 3842 { 3843 return bs->blk ? blk_name(bs->blk) : ""; 3844 } 3845 3846 int bdrv_get_flags(BlockDriverState *bs) 3847 { 3848 return bs->open_flags; 3849 } 3850 3851 int bdrv_flush_all(void) 3852 { 3853 BlockDriverState *bs; 3854 int result = 0; 3855 3856 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 3857 AioContext *aio_context = bdrv_get_aio_context(bs); 3858 int ret; 3859 3860 aio_context_acquire(aio_context); 3861 ret = bdrv_flush(bs); 3862 if (ret < 0 && !result) { 3863 result = ret; 3864 } 3865 aio_context_release(aio_context); 3866 } 3867 3868 return result; 3869 } 3870 3871 int bdrv_has_zero_init_1(BlockDriverState *bs) 3872 { 3873 return 1; 3874 } 3875 3876 int bdrv_has_zero_init(BlockDriverState *bs) 3877 { 3878 assert(bs->drv); 3879 3880 /* If BS is a copy on write image, it is initialized to 3881 the contents of the base image, which may not be zeroes. 
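       For instance (an illustrative case): a freshly created qcow2 overlay
       of a populated backing file reads back the backing file's data, so
       it must report 0 here.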
*/ 3882 if (bs->backing_hd) { 3883 return 0; 3884 } 3885 if (bs->drv->bdrv_has_zero_init) { 3886 return bs->drv->bdrv_has_zero_init(bs); 3887 } 3888 3889 /* safe default */ 3890 return 0; 3891 } 3892 3893 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) 3894 { 3895 BlockDriverInfo bdi; 3896 3897 if (bs->backing_hd) { 3898 return false; 3899 } 3900 3901 if (bdrv_get_info(bs, &bdi) == 0) { 3902 return bdi.unallocated_blocks_are_zero; 3903 } 3904 3905 return false; 3906 } 3907 3908 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) 3909 { 3910 BlockDriverInfo bdi; 3911 3912 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { 3913 return false; 3914 } 3915 3916 if (bdrv_get_info(bs, &bdi) == 0) { 3917 return bdi.can_write_zeroes_with_unmap; 3918 } 3919 3920 return false; 3921 } 3922 3923 typedef struct BdrvCoGetBlockStatusData { 3924 BlockDriverState *bs; 3925 BlockDriverState *base; 3926 int64_t sector_num; 3927 int nb_sectors; 3928 int *pnum; 3929 int64_t ret; 3930 bool done; 3931 } BdrvCoGetBlockStatusData; 3932 3933 /* 3934 * Returns the allocation status of the specified sectors. 3935 * Drivers not implementing the functionality are assumed to not support 3936 * backing files, hence all their sectors are reported as allocated. 3937 * 3938 * If 'sector_num' is beyond the end of the disk image the return value is 0 3939 * and 'pnum' is set to 0. 3940 * 3941 * 'pnum' is set to the number of sectors (including and immediately following 3942 * the specified sector) that are known to be in the same 3943 * allocated/unallocated state. 3944 * 3945 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 3946 * beyond the end of the disk image it will be clamped. 3947 */ 3948 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 3949 int64_t sector_num, 3950 int nb_sectors, int *pnum) 3951 { 3952 int64_t total_sectors; 3953 int64_t n; 3954 int64_t ret, ret2; 3955 3956 total_sectors = bdrv_nb_sectors(bs); 3957 if (total_sectors < 0) { 3958 return total_sectors; 3959 } 3960 3961 if (sector_num >= total_sectors) { 3962 *pnum = 0; 3963 return 0; 3964 } 3965 3966 n = total_sectors - sector_num; 3967 if (n < nb_sectors) { 3968 nb_sectors = n; 3969 } 3970 3971 if (!bs->drv->bdrv_co_get_block_status) { 3972 *pnum = nb_sectors; 3973 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 3974 if (bs->drv->protocol_name) { 3975 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 3976 } 3977 return ret; 3978 } 3979 3980 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 3981 if (ret < 0) { 3982 *pnum = 0; 3983 return ret; 3984 } 3985 3986 if (ret & BDRV_BLOCK_RAW) { 3987 assert(ret & BDRV_BLOCK_OFFSET_VALID); 3988 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 3989 *pnum, pnum); 3990 } 3991 3992 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 3993 ret |= BDRV_BLOCK_ALLOCATED; 3994 } 3995 3996 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { 3997 if (bdrv_unallocated_blocks_are_zero(bs)) { 3998 ret |= BDRV_BLOCK_ZERO; 3999 } else if (bs->backing_hd) { 4000 BlockDriverState *bs2 = bs->backing_hd; 4001 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 4002 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 4003 ret |= BDRV_BLOCK_ZERO; 4004 } 4005 } 4006 } 4007 4008 if (bs->file && 4009 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 4010 (ret & BDRV_BLOCK_OFFSET_VALID)) { 4011 int file_pnum; 4012 4013 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 4014 *pnum, 
&file_pnum); 4015 if (ret2 >= 0) { 4016 /* Ignore errors. This is just providing extra information, it 4017 * is useful but not necessary. 4018 */ 4019 if (!file_pnum) { 4020 /* !file_pnum indicates an offset at or beyond the EOF; it is 4021 * perfectly valid for the format block driver to point to such 4022 * offsets, so catch it and mark everything as zero */ 4023 ret |= BDRV_BLOCK_ZERO; 4024 } else { 4025 /* Limit request to the range reported by the protocol driver */ 4026 *pnum = file_pnum; 4027 ret |= (ret2 & BDRV_BLOCK_ZERO); 4028 } 4029 } 4030 } 4031 4032 return ret; 4033 } 4034 4035 /* Coroutine wrapper for bdrv_get_block_status() */ 4036 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) 4037 { 4038 BdrvCoGetBlockStatusData *data = opaque; 4039 BlockDriverState *bs = data->bs; 4040 4041 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, 4042 data->pnum); 4043 data->done = true; 4044 } 4045 4046 /* 4047 * Synchronous wrapper around bdrv_co_get_block_status(). 4048 * 4049 * See bdrv_co_get_block_status() for details. 4050 */ 4051 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, 4052 int nb_sectors, int *pnum) 4053 { 4054 Coroutine *co; 4055 BdrvCoGetBlockStatusData data = { 4056 .bs = bs, 4057 .sector_num = sector_num, 4058 .nb_sectors = nb_sectors, 4059 .pnum = pnum, 4060 .done = false, 4061 }; 4062 4063 if (qemu_in_coroutine()) { 4064 /* Fast-path if already in coroutine context */ 4065 bdrv_get_block_status_co_entry(&data); 4066 } else { 4067 AioContext *aio_context = bdrv_get_aio_context(bs); 4068 4069 co = qemu_coroutine_create(bdrv_get_block_status_co_entry); 4070 qemu_coroutine_enter(co, &data); 4071 while (!data.done) { 4072 aio_poll(aio_context, true); 4073 } 4074 } 4075 return data.ret; 4076 } 4077 4078 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 4079 int nb_sectors, int *pnum) 4080 { 4081 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 4082 if (ret < 0) { 4083 return ret; 4084 } 4085 return !!(ret & BDRV_BLOCK_ALLOCATED); 4086 } 4087 4088 /* 4089 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 4090 * 4091 * Return true if the given sector is allocated in any image between 4092 * BASE and TOP (inclusive). BASE can be NULL to check if the given 4093 * sector is allocated in any image of the chain. Return false otherwise. 4094 * 4095 * 'pnum' is set to the number of sectors (including and immediately following 4096 * the specified sector) that are known to be in the same 4097 * allocated/unallocated state. 4098 * 4099 */ 4100 int bdrv_is_allocated_above(BlockDriverState *top, 4101 BlockDriverState *base, 4102 int64_t sector_num, 4103 int nb_sectors, int *pnum) 4104 { 4105 BlockDriverState *intermediate; 4106 int ret, n = nb_sectors; 4107 4108 intermediate = top; 4109 while (intermediate && intermediate != base) { 4110 int pnum_inter; 4111 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 4112 &pnum_inter); 4113 if (ret < 0) { 4114 return ret; 4115 } else if (ret) { 4116 *pnum = pnum_inter; 4117 return 1; 4118 } 4119 4120 /* 4121 * [sector_num, nb_sectors] is unallocated on top but intermediate 4122 * might have 4123 * 4124 * [sector_num+x, nr_sectors] allocated. 
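         *
         * Illustration (values assumed): if nb_sectors is 100 but this
         * layer reports only pnum_inter = 40 unallocated sectors, n is
         * clamped to 40 so the final *pnum never claims more sectors than
         * every layer in the chain was actually queried about.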
4125 */ 4126 if (n > pnum_inter && 4127 (intermediate == top || 4128 sector_num + pnum_inter < intermediate->total_sectors)) { 4129 n = pnum_inter; 4130 } 4131 4132 intermediate = intermediate->backing_hd; 4133 } 4134 4135 *pnum = n; 4136 return 0; 4137 } 4138 4139 const char *bdrv_get_encrypted_filename(BlockDriverState *bs) 4140 { 4141 if (bs->backing_hd && bs->backing_hd->encrypted) 4142 return bs->backing_file; 4143 else if (bs->encrypted) 4144 return bs->filename; 4145 else 4146 return NULL; 4147 } 4148 4149 void bdrv_get_backing_filename(BlockDriverState *bs, 4150 char *filename, int filename_size) 4151 { 4152 pstrcpy(filename, filename_size, bs->backing_file); 4153 } 4154 4155 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 4156 const uint8_t *buf, int nb_sectors) 4157 { 4158 BlockDriver *drv = bs->drv; 4159 if (!drv) 4160 return -ENOMEDIUM; 4161 if (!drv->bdrv_write_compressed) 4162 return -ENOTSUP; 4163 if (bdrv_check_request(bs, sector_num, nb_sectors)) 4164 return -EIO; 4165 4166 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 4167 4168 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 4169 } 4170 4171 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 4172 { 4173 BlockDriver *drv = bs->drv; 4174 if (!drv) 4175 return -ENOMEDIUM; 4176 if (!drv->bdrv_get_info) 4177 return -ENOTSUP; 4178 memset(bdi, 0, sizeof(*bdi)); 4179 return drv->bdrv_get_info(bs, bdi); 4180 } 4181 4182 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) 4183 { 4184 BlockDriver *drv = bs->drv; 4185 if (drv && drv->bdrv_get_specific_info) { 4186 return drv->bdrv_get_specific_info(bs); 4187 } 4188 return NULL; 4189 } 4190 4191 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 4192 int64_t pos, int size) 4193 { 4194 QEMUIOVector qiov; 4195 struct iovec iov = { 4196 .iov_base = (void *) buf, 4197 .iov_len = size, 4198 }; 4199 4200 qemu_iovec_init_external(&qiov, &iov, 1); 4201 return bdrv_writev_vmstate(bs, &qiov, pos); 4202 } 4203 4204 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 4205 { 4206 BlockDriver *drv = bs->drv; 4207 4208 if (!drv) { 4209 return -ENOMEDIUM; 4210 } else if (drv->bdrv_save_vmstate) { 4211 return drv->bdrv_save_vmstate(bs, qiov, pos); 4212 } else if (bs->file) { 4213 return bdrv_writev_vmstate(bs->file, qiov, pos); 4214 } 4215 4216 return -ENOTSUP; 4217 } 4218 4219 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 4220 int64_t pos, int size) 4221 { 4222 BlockDriver *drv = bs->drv; 4223 if (!drv) 4224 return -ENOMEDIUM; 4225 if (drv->bdrv_load_vmstate) 4226 return drv->bdrv_load_vmstate(bs, buf, pos, size); 4227 if (bs->file) 4228 return bdrv_load_vmstate(bs->file, buf, pos, size); 4229 return -ENOTSUP; 4230 } 4231 4232 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) 4233 { 4234 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { 4235 return; 4236 } 4237 4238 bs->drv->bdrv_debug_event(bs, event); 4239 } 4240 4241 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, 4242 const char *tag) 4243 { 4244 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { 4245 bs = bs->file; 4246 } 4247 4248 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { 4249 return bs->drv->bdrv_debug_breakpoint(bs, event, tag); 4250 } 4251 4252 return -ENOTSUP; 4253 } 4254 4255 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) 4256 { 4257 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { 4258 bs = bs->file; 4259 } 4260 4261 if (bs && 
bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { 4262 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); 4263 } 4264 4265 return -ENOTSUP; 4266 } 4267 4268 int bdrv_debug_resume(BlockDriverState *bs, const char *tag) 4269 { 4270 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { 4271 bs = bs->file; 4272 } 4273 4274 if (bs && bs->drv && bs->drv->bdrv_debug_resume) { 4275 return bs->drv->bdrv_debug_resume(bs, tag); 4276 } 4277 4278 return -ENOTSUP; 4279 } 4280 4281 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) 4282 { 4283 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { 4284 bs = bs->file; 4285 } 4286 4287 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { 4288 return bs->drv->bdrv_debug_is_suspended(bs, tag); 4289 } 4290 4291 return false; 4292 } 4293 4294 int bdrv_is_snapshot(BlockDriverState *bs) 4295 { 4296 return !!(bs->open_flags & BDRV_O_SNAPSHOT); 4297 } 4298 4299 /* backing_file can either be relative, or absolute, or a protocol. If it is 4300 * relative, it must be relative to the chain. So, passing in bs->filename 4301 * from a BDS as backing_file should not be done, as that may be relative to 4302 * the CWD rather than the chain. */ 4303 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, 4304 const char *backing_file) 4305 { 4306 char *filename_full = NULL; 4307 char *backing_file_full = NULL; 4308 char *filename_tmp = NULL; 4309 int is_protocol = 0; 4310 BlockDriverState *curr_bs = NULL; 4311 BlockDriverState *retval = NULL; 4312 4313 if (!bs || !bs->drv || !backing_file) { 4314 return NULL; 4315 } 4316 4317 filename_full = g_malloc(PATH_MAX); 4318 backing_file_full = g_malloc(PATH_MAX); 4319 filename_tmp = g_malloc(PATH_MAX); 4320 4321 is_protocol = path_has_protocol(backing_file); 4322 4323 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { 4324 4325 /* If either of the filename paths is actually a protocol, then 4326 * compare unmodified paths; otherwise make paths relative */ 4327 if (is_protocol || path_has_protocol(curr_bs->backing_file)) { 4328 if (strcmp(backing_file, curr_bs->backing_file) == 0) { 4329 retval = curr_bs->backing_hd; 4330 break; 4331 } 4332 } else { 4333 /* If not an absolute filename path, make it relative to the current 4334 * image's filename path */ 4335 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 4336 backing_file); 4337 4338 /* We are going to compare absolute pathnames */ 4339 if (!realpath(filename_tmp, filename_full)) { 4340 continue; 4341 } 4342 4343 /* We need to make sure the backing filename we are comparing against 4344 * is relative to the current image filename (or absolute) */ 4345 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 4346 curr_bs->backing_file); 4347 4348 if (!realpath(filename_tmp, backing_file_full)) { 4349 continue; 4350 } 4351 4352 if (strcmp(backing_file_full, filename_full) == 0) { 4353 retval = curr_bs->backing_hd; 4354 break; 4355 } 4356 } 4357 } 4358 4359 g_free(filename_full); 4360 g_free(backing_file_full); 4361 g_free(filename_tmp); 4362 return retval; 4363 } 4364 4365 int bdrv_get_backing_file_depth(BlockDriverState *bs) 4366 { 4367 if (!bs->drv) { 4368 return 0; 4369 } 4370 4371 if (!bs->backing_hd) { 4372 return 0; 4373 } 4374 4375 return 1 + bdrv_get_backing_file_depth(bs->backing_hd); 4376 } 4377 4378 /**************************************************************/ 4379 /* async I/Os */ 4380 4381 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 4382 QEMUIOVector *qiov, int 
nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Combine adjacent requests that are exactly sequential or overlapping.
    // Requests separated by a gap are never merged; no zero padding is done.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
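        // Example (illustrative): after sorting, {sector 0, 8 sectors}
        // followed by {sector 8, 8 sectors} gives oldreq_last == 8 ==
        // reqs[i].sector, so the two merge into a single 16-sector write;
        // a request starting at sector 10 would leave a gap and is not
        // merged.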
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted.  On error, this function returns -1, and any of the
 * requests may or may not have been submitted yet.  In particular, this means
 * that the callback will be called for some of the requests, for others it
 * won't.  The caller must check the error field of the BlockRequest to wait
 * for the right callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests.  However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests.
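     *
     * num_requests must be set to its final value before the first request
     * is submitted: a request may complete (and run multiwrite_cb, which
     * decrements the counter and frees mcb once it reaches zero) before the
     * submission loop below has finished.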
*/ 4589 mcb->num_requests = num_reqs; 4590 for (i = 0; i < num_reqs; i++) { 4591 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 4592 reqs[i].nb_sectors, reqs[i].flags, 4593 multiwrite_cb, mcb, 4594 true); 4595 } 4596 4597 return 0; 4598 } 4599 4600 void bdrv_aio_cancel(BlockAIOCB *acb) 4601 { 4602 qemu_aio_ref(acb); 4603 bdrv_aio_cancel_async(acb); 4604 while (acb->refcnt > 1) { 4605 if (acb->aiocb_info->get_aio_context) { 4606 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 4607 } else if (acb->bs) { 4608 aio_poll(bdrv_get_aio_context(acb->bs), true); 4609 } else { 4610 abort(); 4611 } 4612 } 4613 qemu_aio_unref(acb); 4614 } 4615 4616 /* Async version of aio cancel. The caller is not blocked if the acb implements 4617 * cancel_async, otherwise we do nothing and let the request normally complete. 4618 * In either case the completion callback must be called. */ 4619 void bdrv_aio_cancel_async(BlockAIOCB *acb) 4620 { 4621 if (acb->aiocb_info->cancel_async) { 4622 acb->aiocb_info->cancel_async(acb); 4623 } 4624 } 4625 4626 /**************************************************************/ 4627 /* async block device emulation */ 4628 4629 typedef struct BlockAIOCBSync { 4630 BlockAIOCB common; 4631 QEMUBH *bh; 4632 int ret; 4633 /* vector translation state */ 4634 QEMUIOVector *qiov; 4635 uint8_t *bounce; 4636 int is_write; 4637 } BlockAIOCBSync; 4638 4639 static const AIOCBInfo bdrv_em_aiocb_info = { 4640 .aiocb_size = sizeof(BlockAIOCBSync), 4641 }; 4642 4643 static void bdrv_aio_bh_cb(void *opaque) 4644 { 4645 BlockAIOCBSync *acb = opaque; 4646 4647 if (!acb->is_write && acb->ret >= 0) { 4648 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 4649 } 4650 qemu_vfree(acb->bounce); 4651 acb->common.cb(acb->common.opaque, acb->ret); 4652 qemu_bh_delete(acb->bh); 4653 acb->bh = NULL; 4654 qemu_aio_unref(acb); 4655 } 4656 4657 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 4658 int64_t sector_num, 4659 QEMUIOVector *qiov, 4660 int nb_sectors, 4661 BlockCompletionFunc *cb, 4662 void *opaque, 4663 int is_write) 4664 4665 { 4666 BlockAIOCBSync *acb; 4667 4668 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 4669 acb->is_write = is_write; 4670 acb->qiov = qiov; 4671 acb->bounce = qemu_try_blockalign(bs, qiov->size); 4672 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); 4673 4674 if (acb->bounce == NULL) { 4675 acb->ret = -ENOMEM; 4676 } else if (is_write) { 4677 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 4678 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 4679 } else { 4680 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 4681 } 4682 4683 qemu_bh_schedule(acb->bh); 4684 4685 return &acb->common; 4686 } 4687 4688 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 4689 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 4690 BlockCompletionFunc *cb, void *opaque) 4691 { 4692 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 4693 } 4694 4695 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 4696 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 4697 BlockCompletionFunc *cb, void *opaque) 4698 { 4699 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 4700 } 4701 4702 4703 typedef struct BlockAIOCBCoroutine { 4704 BlockAIOCB common; 4705 BlockRequest req; 4706 bool is_write; 4707 bool *done; 4708 QEMUBH* bh; 4709 } BlockAIOCBCoroutine; 4710 4711 static const AIOCBInfo bdrv_em_co_aiocb_info = { 
4712 .aiocb_size = sizeof(BlockAIOCBCoroutine), 4713 }; 4714 4715 static void bdrv_co_em_bh(void *opaque) 4716 { 4717 BlockAIOCBCoroutine *acb = opaque; 4718 4719 acb->common.cb(acb->common.opaque, acb->req.error); 4720 4721 qemu_bh_delete(acb->bh); 4722 qemu_aio_unref(acb); 4723 } 4724 4725 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 4726 static void coroutine_fn bdrv_co_do_rw(void *opaque) 4727 { 4728 BlockAIOCBCoroutine *acb = opaque; 4729 BlockDriverState *bs = acb->common.bs; 4730 4731 if (!acb->is_write) { 4732 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 4733 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 4734 } else { 4735 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 4736 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 4737 } 4738 4739 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4740 qemu_bh_schedule(acb->bh); 4741 } 4742 4743 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 4744 int64_t sector_num, 4745 QEMUIOVector *qiov, 4746 int nb_sectors, 4747 BdrvRequestFlags flags, 4748 BlockCompletionFunc *cb, 4749 void *opaque, 4750 bool is_write) 4751 { 4752 Coroutine *co; 4753 BlockAIOCBCoroutine *acb; 4754 4755 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4756 acb->req.sector = sector_num; 4757 acb->req.nb_sectors = nb_sectors; 4758 acb->req.qiov = qiov; 4759 acb->req.flags = flags; 4760 acb->is_write = is_write; 4761 4762 co = qemu_coroutine_create(bdrv_co_do_rw); 4763 qemu_coroutine_enter(co, acb); 4764 4765 return &acb->common; 4766 } 4767 4768 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 4769 { 4770 BlockAIOCBCoroutine *acb = opaque; 4771 BlockDriverState *bs = acb->common.bs; 4772 4773 acb->req.error = bdrv_co_flush(bs); 4774 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4775 qemu_bh_schedule(acb->bh); 4776 } 4777 4778 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, 4779 BlockCompletionFunc *cb, void *opaque) 4780 { 4781 trace_bdrv_aio_flush(bs, opaque); 4782 4783 Coroutine *co; 4784 BlockAIOCBCoroutine *acb; 4785 4786 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4787 4788 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 4789 qemu_coroutine_enter(co, acb); 4790 4791 return &acb->common; 4792 } 4793 4794 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 4795 { 4796 BlockAIOCBCoroutine *acb = opaque; 4797 BlockDriverState *bs = acb->common.bs; 4798 4799 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 4800 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4801 qemu_bh_schedule(acb->bh); 4802 } 4803 4804 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, 4805 int64_t sector_num, int nb_sectors, 4806 BlockCompletionFunc *cb, void *opaque) 4807 { 4808 Coroutine *co; 4809 BlockAIOCBCoroutine *acb; 4810 4811 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 4812 4813 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4814 acb->req.sector = sector_num; 4815 acb->req.nb_sectors = nb_sectors; 4816 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 4817 qemu_coroutine_enter(co, acb); 4818 4819 return &acb->common; 4820 } 4821 4822 void bdrv_init(void) 4823 { 4824 module_call_init(MODULE_INIT_BLOCK); 4825 } 4826 4827 void bdrv_init_with_whitelist(void) 4828 { 4829 use_bdrv_whitelist = 1; 4830 bdrv_init(); 4831 } 4832 4833 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 4834 BlockCompletionFunc *cb, void *opaque) 4835 { 4836 BlockAIOCB *acb; 
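
    /* The AIOCB is allocated with the size recorded in the AIOCBInfo, so
     * implementations embed BlockAIOCB as the first field of their own
     * state and publish their size, e.g. (hypothetical driver AIOCB):
     *
     *     typedef struct MyAIOCB {
     *         BlockAIOCB common;       // must be the first field
     *         int my_private_state;
     *     } MyAIOCB;
     *
     *     static const AIOCBInfo my_aiocb_info = {
     *         .aiocb_size = sizeof(MyAIOCB),
     *     };
     *
     *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
     */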
4837 4838 acb = g_slice_alloc(aiocb_info->aiocb_size); 4839 acb->aiocb_info = aiocb_info; 4840 acb->bs = bs; 4841 acb->cb = cb; 4842 acb->opaque = opaque; 4843 acb->refcnt = 1; 4844 return acb; 4845 } 4846 4847 void qemu_aio_ref(void *p) 4848 { 4849 BlockAIOCB *acb = p; 4850 acb->refcnt++; 4851 } 4852 4853 void qemu_aio_unref(void *p) 4854 { 4855 BlockAIOCB *acb = p; 4856 assert(acb->refcnt > 0); 4857 if (--acb->refcnt == 0) { 4858 g_slice_free1(acb->aiocb_info->aiocb_size, acb); 4859 } 4860 } 4861 4862 /**************************************************************/ 4863 /* Coroutine block device emulation */ 4864 4865 typedef struct CoroutineIOCompletion { 4866 Coroutine *coroutine; 4867 int ret; 4868 } CoroutineIOCompletion; 4869 4870 static void bdrv_co_io_em_complete(void *opaque, int ret) 4871 { 4872 CoroutineIOCompletion *co = opaque; 4873 4874 co->ret = ret; 4875 qemu_coroutine_enter(co->coroutine, NULL); 4876 } 4877 4878 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 4879 int nb_sectors, QEMUIOVector *iov, 4880 bool is_write) 4881 { 4882 CoroutineIOCompletion co = { 4883 .coroutine = qemu_coroutine_self(), 4884 }; 4885 BlockAIOCB *acb; 4886 4887 if (is_write) { 4888 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 4889 bdrv_co_io_em_complete, &co); 4890 } else { 4891 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 4892 bdrv_co_io_em_complete, &co); 4893 } 4894 4895 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 4896 if (!acb) { 4897 return -EIO; 4898 } 4899 qemu_coroutine_yield(); 4900 4901 return co.ret; 4902 } 4903 4904 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 4905 int64_t sector_num, int nb_sectors, 4906 QEMUIOVector *iov) 4907 { 4908 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 4909 } 4910 4911 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 4912 int64_t sector_num, int nb_sectors, 4913 QEMUIOVector *iov) 4914 { 4915 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 4916 } 4917 4918 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 4919 { 4920 RwCo *rwco = opaque; 4921 4922 rwco->ret = bdrv_co_flush(rwco->bs); 4923 } 4924 4925 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 4926 { 4927 int ret; 4928 4929 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { 4930 return 0; 4931 } 4932 4933 /* Write back cached data to the OS even with cache=unsafe */ 4934 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 4935 if (bs->drv->bdrv_co_flush_to_os) { 4936 ret = bs->drv->bdrv_co_flush_to_os(bs); 4937 if (ret < 0) { 4938 return ret; 4939 } 4940 } 4941 4942 /* But don't actually force it to the disk with cache=unsafe */ 4943 if (bs->open_flags & BDRV_O_NO_FLUSH) { 4944 goto flush_parent; 4945 } 4946 4947 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 4948 if (bs->drv->bdrv_co_flush_to_disk) { 4949 ret = bs->drv->bdrv_co_flush_to_disk(bs); 4950 } else if (bs->drv->bdrv_aio_flush) { 4951 BlockAIOCB *acb; 4952 CoroutineIOCompletion co = { 4953 .coroutine = qemu_coroutine_self(), 4954 }; 4955 4956 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 4957 if (acb == NULL) { 4958 ret = -EIO; 4959 } else { 4960 qemu_coroutine_yield(); 4961 ret = co.ret; 4962 } 4963 } else { 4964 /* 4965 * Some block drivers always operate in either writethrough or unsafe 4966 * mode and don't support bdrv_flush therefore. 
Usually qemu doesn't 4967 * know how the server works (because the behaviour is hardcoded or 4968 * depends on server-side configuration), so we can't ensure that 4969 * everything is safe on disk. Returning an error doesn't work because 4970 * that would break guests even if the server operates in writethrough 4971 * mode. 4972 * 4973 * Let's hope the user knows what he's doing. 4974 */ 4975 ret = 0; 4976 } 4977 if (ret < 0) { 4978 return ret; 4979 } 4980 4981 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 4982 * in the case of cache=unsafe, so there are no useless flushes. 4983 */ 4984 flush_parent: 4985 return bdrv_co_flush(bs->file); 4986 } 4987 4988 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) 4989 { 4990 Error *local_err = NULL; 4991 int ret; 4992 4993 if (!bs->drv) { 4994 return; 4995 } 4996 4997 if (!(bs->open_flags & BDRV_O_INCOMING)) { 4998 return; 4999 } 5000 bs->open_flags &= ~BDRV_O_INCOMING; 5001 5002 if (bs->drv->bdrv_invalidate_cache) { 5003 bs->drv->bdrv_invalidate_cache(bs, &local_err); 5004 } else if (bs->file) { 5005 bdrv_invalidate_cache(bs->file, &local_err); 5006 } 5007 if (local_err) { 5008 error_propagate(errp, local_err); 5009 return; 5010 } 5011 5012 ret = refresh_total_sectors(bs, bs->total_sectors); 5013 if (ret < 0) { 5014 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 5015 return; 5016 } 5017 } 5018 5019 void bdrv_invalidate_cache_all(Error **errp) 5020 { 5021 BlockDriverState *bs; 5022 Error *local_err = NULL; 5023 5024 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 5025 AioContext *aio_context = bdrv_get_aio_context(bs); 5026 5027 aio_context_acquire(aio_context); 5028 bdrv_invalidate_cache(bs, &local_err); 5029 aio_context_release(aio_context); 5030 if (local_err) { 5031 error_propagate(errp, local_err); 5032 return; 5033 } 5034 } 5035 } 5036 5037 int bdrv_flush(BlockDriverState *bs) 5038 { 5039 Coroutine *co; 5040 RwCo rwco = { 5041 .bs = bs, 5042 .ret = NOT_DONE, 5043 }; 5044 5045 if (qemu_in_coroutine()) { 5046 /* Fast-path if already in coroutine context */ 5047 bdrv_flush_co_entry(&rwco); 5048 } else { 5049 AioContext *aio_context = bdrv_get_aio_context(bs); 5050 5051 co = qemu_coroutine_create(bdrv_flush_co_entry); 5052 qemu_coroutine_enter(co, &rwco); 5053 while (rwco.ret == NOT_DONE) { 5054 aio_poll(aio_context, true); 5055 } 5056 } 5057 5058 return rwco.ret; 5059 } 5060 5061 typedef struct DiscardCo { 5062 BlockDriverState *bs; 5063 int64_t sector_num; 5064 int nb_sectors; 5065 int ret; 5066 } DiscardCo; 5067 static void coroutine_fn bdrv_discard_co_entry(void *opaque) 5068 { 5069 DiscardCo *rwco = opaque; 5070 5071 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); 5072 } 5073 5074 /* if no limit is specified in the BlockLimits use a default 5075 * of 32768 512-byte sectors (16 MiB) per request. 5076 */ 5077 #define MAX_DISCARD_DEFAULT 32768 5078 5079 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, 5080 int nb_sectors) 5081 { 5082 int max_discard; 5083 5084 if (!bs->drv) { 5085 return -ENOMEDIUM; 5086 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { 5087 return -EIO; 5088 } else if (bs->read_only) { 5089 return -EROFS; 5090 } 5091 5092 bdrv_reset_dirty(bs, sector_num, nb_sectors); 5093 5094 /* Do nothing if disabled. 
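     * (BDRV_O_UNMAP is set when the drive is opened with discard=unmap;
     * without it, the request returns success without discarding anything.)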
     */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media.  Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;
    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    device_name = bdrv_get_device_name(bs);
    if (device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(device_name,
                                          eject_flag, &error_abort);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
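 *
 * Device models typically call this on guest request; a CD-ROM emulation,
 * for instance, would lock the tray while a PREVENT ALLOW MEDIUM REMOVAL
 * command is in effect (illustrative; the call is simply forwarded to the
 * driver, which may not support locking at all).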
5229 */ 5230 void bdrv_lock_medium(BlockDriverState *bs, bool locked) 5231 { 5232 BlockDriver *drv = bs->drv; 5233 5234 trace_bdrv_lock_medium(bs, locked); 5235 5236 if (drv && drv->bdrv_lock_medium) { 5237 drv->bdrv_lock_medium(bs, locked); 5238 } 5239 } 5240 5241 /* needed for generic scsi interface */ 5242 5243 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 5244 { 5245 BlockDriver *drv = bs->drv; 5246 5247 if (drv && drv->bdrv_ioctl) 5248 return drv->bdrv_ioctl(bs, req, buf); 5249 return -ENOTSUP; 5250 } 5251 5252 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, 5253 unsigned long int req, void *buf, 5254 BlockCompletionFunc *cb, void *opaque) 5255 { 5256 BlockDriver *drv = bs->drv; 5257 5258 if (drv && drv->bdrv_aio_ioctl) 5259 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); 5260 return NULL; 5261 } 5262 5263 void bdrv_set_guest_block_size(BlockDriverState *bs, int align) 5264 { 5265 bs->guest_block_size = align; 5266 } 5267 5268 void *qemu_blockalign(BlockDriverState *bs, size_t size) 5269 { 5270 return qemu_memalign(bdrv_opt_mem_align(bs), size); 5271 } 5272 5273 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 5274 { 5275 return memset(qemu_blockalign(bs, size), 0, size); 5276 } 5277 5278 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 5279 { 5280 size_t align = bdrv_opt_mem_align(bs); 5281 5282 /* Ensure that NULL is never returned on success */ 5283 assert(align > 0); 5284 if (size == 0) { 5285 size = align; 5286 } 5287 5288 return qemu_try_memalign(align, size); 5289 } 5290 5291 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 5292 { 5293 void *mem = qemu_try_blockalign(bs, size); 5294 5295 if (mem) { 5296 memset(mem, 0, size); 5297 } 5298 5299 return mem; 5300 } 5301 5302 /* 5303 * Check if all memory in this vector is sector aligned. 
5304 */ 5305 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 5306 { 5307 int i; 5308 size_t alignment = bdrv_opt_mem_align(bs); 5309 5310 for (i = 0; i < qiov->niov; i++) { 5311 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 5312 return false; 5313 } 5314 if (qiov->iov[i].iov_len % alignment) { 5315 return false; 5316 } 5317 } 5318 5319 return true; 5320 } 5321 5322 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, 5323 Error **errp) 5324 { 5325 int64_t bitmap_size; 5326 BdrvDirtyBitmap *bitmap; 5327 5328 assert((granularity & (granularity - 1)) == 0); 5329 5330 granularity >>= BDRV_SECTOR_BITS; 5331 assert(granularity); 5332 bitmap_size = bdrv_nb_sectors(bs); 5333 if (bitmap_size < 0) { 5334 error_setg_errno(errp, -bitmap_size, "could not get length of device"); 5335 errno = -bitmap_size; 5336 return NULL; 5337 } 5338 bitmap = g_new0(BdrvDirtyBitmap, 1); 5339 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); 5340 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); 5341 return bitmap; 5342 } 5343 5344 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) 5345 { 5346 BdrvDirtyBitmap *bm, *next; 5347 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { 5348 if (bm == bitmap) { 5349 QLIST_REMOVE(bitmap, list); 5350 hbitmap_free(bitmap->bitmap); 5351 g_free(bitmap); 5352 return; 5353 } 5354 } 5355 } 5356 5357 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) 5358 { 5359 BdrvDirtyBitmap *bm; 5360 BlockDirtyInfoList *list = NULL; 5361 BlockDirtyInfoList **plist = &list; 5362 5363 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { 5364 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); 5365 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); 5366 info->count = bdrv_get_dirty_count(bs, bm); 5367 info->granularity = 5368 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); 5369 entry->value = info; 5370 *plist = entry; 5371 plist = &entry->next; 5372 } 5373 5374 return list; 5375 } 5376 5377 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector) 5378 { 5379 if (bitmap) { 5380 return hbitmap_get(bitmap->bitmap, sector); 5381 } else { 5382 return 0; 5383 } 5384 } 5385 5386 void bdrv_dirty_iter_init(BlockDriverState *bs, 5387 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) 5388 { 5389 hbitmap_iter_init(hbi, bitmap->bitmap, 0); 5390 } 5391 5392 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, 5393 int nr_sectors) 5394 { 5395 BdrvDirtyBitmap *bitmap; 5396 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { 5397 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); 5398 } 5399 } 5400 5401 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) 5402 { 5403 BdrvDirtyBitmap *bitmap; 5404 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { 5405 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); 5406 } 5407 } 5408 5409 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) 5410 { 5411 return hbitmap_count(bitmap->bitmap); 5412 } 5413 5414 /* Get a reference to bs */ 5415 void bdrv_ref(BlockDriverState *bs) 5416 { 5417 bs->refcnt++; 5418 } 5419 5420 /* Release a previously grabbed reference to bs. 5421 * If after releasing, reference count is zero, the BlockDriverState is 5422 * deleted. 
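 *
 * A typical pairing (sketch): take a reference before an operation that
 * may drop the last other reference, and release it afterwards:
 *
 *     bdrv_ref(bs);
 *     ...operation that may call bdrv_unref() internally...
 *     bdrv_unref(bs);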
*/ 5423 void bdrv_unref(BlockDriverState *bs) 5424 { 5425 if (!bs) { 5426 return; 5427 } 5428 assert(bs->refcnt > 0); 5429 if (--bs->refcnt == 0) { 5430 bdrv_delete(bs); 5431 } 5432 } 5433 5434 struct BdrvOpBlocker { 5435 Error *reason; 5436 QLIST_ENTRY(BdrvOpBlocker) list; 5437 }; 5438 5439 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp) 5440 { 5441 BdrvOpBlocker *blocker; 5442 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5443 if (!QLIST_EMPTY(&bs->op_blockers[op])) { 5444 blocker = QLIST_FIRST(&bs->op_blockers[op]); 5445 if (errp) { 5446 error_setg(errp, "Device '%s' is busy: %s", 5447 bdrv_get_device_name(bs), 5448 error_get_pretty(blocker->reason)); 5449 } 5450 return true; 5451 } 5452 return false; 5453 } 5454 5455 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason) 5456 { 5457 BdrvOpBlocker *blocker; 5458 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5459 5460 blocker = g_new0(BdrvOpBlocker, 1); 5461 blocker->reason = reason; 5462 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list); 5463 } 5464 5465 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason) 5466 { 5467 BdrvOpBlocker *blocker, *next; 5468 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5469 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) { 5470 if (blocker->reason == reason) { 5471 QLIST_REMOVE(blocker, list); 5472 g_free(blocker); 5473 } 5474 } 5475 } 5476 5477 void bdrv_op_block_all(BlockDriverState *bs, Error *reason) 5478 { 5479 int i; 5480 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5481 bdrv_op_block(bs, i, reason); 5482 } 5483 } 5484 5485 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason) 5486 { 5487 int i; 5488 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5489 bdrv_op_unblock(bs, i, reason); 5490 } 5491 } 5492 5493 bool bdrv_op_blocker_is_empty(BlockDriverState *bs) 5494 { 5495 int i; 5496 5497 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5498 if (!QLIST_EMPTY(&bs->op_blockers[i])) { 5499 return false; 5500 } 5501 } 5502 return true; 5503 } 5504 5505 void bdrv_iostatus_enable(BlockDriverState *bs) 5506 { 5507 bs->iostatus_enabled = true; 5508 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; 5509 } 5510 5511 /* The I/O status is only enabled if the drive explicitly 5512 * enables it _and_ the VM is configured to stop on errors */ 5513 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs) 5514 { 5515 return (bs->iostatus_enabled && 5516 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || 5517 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP || 5518 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP)); 5519 } 5520 5521 void bdrv_iostatus_disable(BlockDriverState *bs) 5522 { 5523 bs->iostatus_enabled = false; 5524 } 5525 5526 void bdrv_iostatus_reset(BlockDriverState *bs) 5527 { 5528 if (bdrv_iostatus_is_enabled(bs)) { 5529 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; 5530 if (bs->job) { 5531 block_job_iostatus_reset(bs->job); 5532 } 5533 } 5534 } 5535 5536 void bdrv_iostatus_set_err(BlockDriverState *bs, int error) 5537 { 5538 assert(bdrv_iostatus_is_enabled(bs)); 5539 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { 5540 bs->iostatus = error == ENOSPC ? 
BLOCK_DEVICE_IO_STATUS_NOSPACE : 5541 BLOCK_DEVICE_IO_STATUS_FAILED; 5542 } 5543 } 5544 5545 void bdrv_img_create(const char *filename, const char *fmt, 5546 const char *base_filename, const char *base_fmt, 5547 char *options, uint64_t img_size, int flags, 5548 Error **errp, bool quiet) 5549 { 5550 QemuOptsList *create_opts = NULL; 5551 QemuOpts *opts = NULL; 5552 const char *backing_fmt, *backing_file; 5553 int64_t size; 5554 BlockDriver *drv, *proto_drv; 5555 BlockDriver *backing_drv = NULL; 5556 Error *local_err = NULL; 5557 int ret = 0; 5558 5559 /* Find driver and parse its options */ 5560 drv = bdrv_find_format(fmt); 5561 if (!drv) { 5562 error_setg(errp, "Unknown file format '%s'", fmt); 5563 return; 5564 } 5565 5566 proto_drv = bdrv_find_protocol(filename, true); 5567 if (!proto_drv) { 5568 error_setg(errp, "Unknown protocol '%s'", filename); 5569 return; 5570 } 5571 5572 if (!drv->create_opts) { 5573 error_setg(errp, "Format driver '%s' does not support image creation", 5574 drv->format_name); 5575 return; 5576 } 5577 5578 if (!proto_drv->create_opts) { 5579 error_setg(errp, "Protocol driver '%s' does not support image creation", 5580 proto_drv->format_name); 5581 return; 5582 } 5583 5584 create_opts = qemu_opts_append(create_opts, drv->create_opts); 5585 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); 5586 5587 /* Create parameter list with default values */ 5588 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort); 5589 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size); 5590 5591 /* Parse -o options */ 5592 if (options) { 5593 if (qemu_opts_do_parse(opts, options, NULL) != 0) { 5594 error_setg(errp, "Invalid options for file format '%s'", fmt); 5595 goto out; 5596 } 5597 } 5598 5599 if (base_filename) { 5600 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) { 5601 error_setg(errp, "Backing file not supported for file format '%s'", 5602 fmt); 5603 goto out; 5604 } 5605 } 5606 5607 if (base_fmt) { 5608 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) { 5609 error_setg(errp, "Backing file format not supported for file " 5610 "format '%s'", fmt); 5611 goto out; 5612 } 5613 } 5614 5615 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE); 5616 if (backing_file) { 5617 if (!strcmp(filename, backing_file)) { 5618 error_setg(errp, "Error: Trying to create an image with the " 5619 "same filename as the backing file"); 5620 goto out; 5621 } 5622 } 5623 5624 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); 5625 if (backing_fmt) { 5626 backing_drv = bdrv_find_format(backing_fmt); 5627 if (!backing_drv) { 5628 error_setg(errp, "Unknown backing file format '%s'", 5629 backing_fmt); 5630 goto out; 5631 } 5632 } 5633 5634 // The size for the image must always be specified, with one exception: 5635 // If we are using a backing file, we can obtain the size from there 5636 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); 5637 if (size == -1) { 5638 if (backing_file) { 5639 BlockDriverState *bs; 5640 int64_t size; 5641 int back_flags; 5642 5643 /* backing files always opened read-only */ 5644 back_flags = 5645 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); 5646 5647 bs = NULL; 5648 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags, 5649 backing_drv, &local_err); 5650 if (ret < 0) { 5651 goto out; 5652 } 5653 size = bdrv_getlength(bs); 5654 if (size < 0) { 5655 error_setg_errno(errp, -size, "Could not get size of '%s'", 5656 backing_file); 5657 bdrv_unref(bs); 5658 goto out; 5659 } 5660 5661 
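            /* Propagate the size we read from the backing file into the
             * create options so the driver's create function sees an
             * explicit size. */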
qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size); 5662 5663 bdrv_unref(bs); 5664 } else { 5665 error_setg(errp, "Image creation needs a size parameter"); 5666 goto out; 5667 } 5668 } 5669 5670 if (!quiet) { 5671 printf("Formatting '%s', fmt=%s", filename, fmt); 5672 qemu_opts_print(opts, " "); 5673 puts(""); 5674 } 5675 5676 ret = bdrv_create(drv, filename, opts, &local_err); 5677 5678 if (ret == -EFBIG) { 5679 /* This is generally a better message than whatever the driver would 5680 * deliver (especially because of the cluster_size_hint), since that 5681 * is most probably not much different from "image too large". */ 5682 const char *cluster_size_hint = ""; 5683 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) { 5684 cluster_size_hint = " (try using a larger cluster size)"; 5685 } 5686 error_setg(errp, "The image size is too large for file format '%s'" 5687 "%s", fmt, cluster_size_hint); 5688 error_free(local_err); 5689 local_err = NULL; 5690 } 5691 5692 out: 5693 qemu_opts_del(opts); 5694 qemu_opts_free(create_opts); 5695 if (local_err) { 5696 error_propagate(errp, local_err); 5697 } 5698 } 5699 5700 AioContext *bdrv_get_aio_context(BlockDriverState *bs) 5701 { 5702 return bs->aio_context; 5703 } 5704 5705 void bdrv_detach_aio_context(BlockDriverState *bs) 5706 { 5707 BdrvAioNotifier *baf; 5708 5709 if (!bs->drv) { 5710 return; 5711 } 5712 5713 QLIST_FOREACH(baf, &bs->aio_notifiers, list) { 5714 baf->detach_aio_context(baf->opaque); 5715 } 5716 5717 if (bs->io_limits_enabled) { 5718 throttle_detach_aio_context(&bs->throttle_state); 5719 } 5720 if (bs->drv->bdrv_detach_aio_context) { 5721 bs->drv->bdrv_detach_aio_context(bs); 5722 } 5723 if (bs->file) { 5724 bdrv_detach_aio_context(bs->file); 5725 } 5726 if (bs->backing_hd) { 5727 bdrv_detach_aio_context(bs->backing_hd); 5728 } 5729 5730 bs->aio_context = NULL; 5731 } 5732 5733 void bdrv_attach_aio_context(BlockDriverState *bs, 5734 AioContext *new_context) 5735 { 5736 BdrvAioNotifier *ban; 5737 5738 if (!bs->drv) { 5739 return; 5740 } 5741 5742 bs->aio_context = new_context; 5743 5744 if (bs->backing_hd) { 5745 bdrv_attach_aio_context(bs->backing_hd, new_context); 5746 } 5747 if (bs->file) { 5748 bdrv_attach_aio_context(bs->file, new_context); 5749 } 5750 if (bs->drv->bdrv_attach_aio_context) { 5751 bs->drv->bdrv_attach_aio_context(bs, new_context); 5752 } 5753 if (bs->io_limits_enabled) { 5754 throttle_attach_aio_context(&bs->throttle_state, new_context); 5755 } 5756 5757 QLIST_FOREACH(ban, &bs->aio_notifiers, list) { 5758 ban->attached_aio_context(new_context, ban->opaque); 5759 } 5760 } 5761 5762 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context) 5763 { 5764 bdrv_drain_all(); /* ensure there are no in-flight requests */ 5765 5766 bdrv_detach_aio_context(bs); 5767 5768 /* This function executes in the old AioContext so acquire the new one in 5769 * case it runs in a different thread. 
 */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}

void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
{
    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
    *ban = (BdrvAioNotifier){
        .attached_aio_context = attached_aio_context,
        .detach_aio_context   = detach_aio_context,
        .opaque               = opaque
    };

    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
}

void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*attached_aio_context)(AioContext *,
                                                                   void *),
                                      void (*detach_aio_context)(void *),
                                      void *opaque)
{
    BdrvAioNotifier *ban, *ban_next;

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        if (ban->attached_aio_context == attached_aio_context &&
            ban->detach_aio_context   == detach_aio_context   &&
            ban->opaque               == opaque)
        {
            QLIST_REMOVE(ban, list);
            g_free(ban);

            return;
        }
    }

    abort();
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
                       BlockDriverAmendStatusCB *status_cb)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts, status_cb);
}

/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test if the given bs is the candidate or to recurse further
 * into the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non block filter driver -> check if the bs is
     * the same as the candidate.  It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined, use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but does not allow recursion -> return
     * false
     */
    return false;
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain.  Since we don't have pointers to parents it explores all bs
 * chains from the top.  Some filters can choose not to pass down the
 * recursion.
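 *
 * For example (hypothetical graph): with a filter driver such as blkverify
 * on top of qcow2 on top of file, the walk skips the filter and reports
 * qcow2 as the first non-filter.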
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}

BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    AioContext *aio_context;

    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    aio_context = bdrv_get_aio_context(to_replace_bs);
    aio_context_acquire(aio_context);

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        to_replace_bs = NULL;
        goto out;
    }

    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the topmost non-filter, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files, which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only the topmost non-filter can be replaced");
        to_replace_bs = NULL;
        goto out;
    }

out:
    aio_context_release(aio_context);
    return to_replace_bs;
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}

static bool append_open_options(QDict *d, BlockDriverState *bs)
{
    const QDictEntry *entry;
    bool found_any = false;

    for (entry = qdict_first(bs->options); entry;
         entry = qdict_next(bs->options, entry))
    {
        /* Only take options for this level and exclude all non-driver-specific
         * options */
        if (!strchr(qdict_entry_key(entry), '.') &&
            strcmp(qdict_entry_key(entry), "node-name"))
        {
            qobject_incref(qdict_entry_value(entry));
            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
            found_any = true;
        }
    }

    return found_any;
}

/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the
 *                    same results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here.
Otherwise, 5980 * full_open_options is converted to a JSON object, prefixed with 5981 * "json:" (for use through the JSON pseudo protocol) and put here. 5982 */ 5983 void bdrv_refresh_filename(BlockDriverState *bs) 5984 { 5985 BlockDriver *drv = bs->drv; 5986 QDict *opts; 5987 5988 if (!drv) { 5989 return; 5990 } 5991 5992 /* This BDS's file name will most probably depend on its file's name, so 5993 * refresh that first */ 5994 if (bs->file) { 5995 bdrv_refresh_filename(bs->file); 5996 } 5997 5998 if (drv->bdrv_refresh_filename) { 5999 /* Obsolete information is of no use here, so drop the old file name 6000 * information before refreshing it */ 6001 bs->exact_filename[0] = '\0'; 6002 if (bs->full_open_options) { 6003 QDECREF(bs->full_open_options); 6004 bs->full_open_options = NULL; 6005 } 6006 6007 drv->bdrv_refresh_filename(bs); 6008 } else if (bs->file) { 6009 /* Try to reconstruct valid information from the underlying file */ 6010 bool has_open_options; 6011 6012 bs->exact_filename[0] = '\0'; 6013 if (bs->full_open_options) { 6014 QDECREF(bs->full_open_options); 6015 bs->full_open_options = NULL; 6016 } 6017 6018 opts = qdict_new(); 6019 has_open_options = append_open_options(opts, bs); 6020 6021 /* If no specific options have been given for this BDS, the filename of 6022 * the underlying file should suffice for this one as well */ 6023 if (bs->file->exact_filename[0] && !has_open_options) { 6024 strcpy(bs->exact_filename, bs->file->exact_filename); 6025 } 6026 /* Reconstructing the full options QDict is simple for most format block 6027 * drivers, as long as the full options are known for the underlying 6028 * file BDS. The full options QDict of that file BDS should somehow 6029 * contain a representation of the filename, therefore the following 6030 * suffices without querying the (exact_)filename of this BDS. */ 6031 if (bs->file->full_open_options) { 6032 qdict_put_obj(opts, "driver", 6033 QOBJECT(qstring_from_str(drv->format_name))); 6034 QINCREF(bs->file->full_open_options); 6035 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options)); 6036 6037 bs->full_open_options = opts; 6038 } else { 6039 QDECREF(opts); 6040 } 6041 } else if (!bs->full_open_options && qdict_size(bs->options)) { 6042 /* There is no underlying file BDS (at least referenced by BDS.file), 6043 * so the full options QDict should be equal to the options given 6044 * specifically for this block device when it was opened (plus the 6045 * driver specification). 6046 * Because those options don't change, there is no need to update 6047 * full_open_options when it's already set. */ 6048 6049 opts = qdict_new(); 6050 append_open_options(opts, bs); 6051 qdict_put_obj(opts, "driver", 6052 QOBJECT(qstring_from_str(drv->format_name))); 6053 6054 if (bs->exact_filename[0]) { 6055 /* This may not work for all block protocol drivers (some may 6056 * require this filename to be parsed), but we have to find some 6057 * default solution here, so just include it. If some block driver 6058 * does not support pure options without any filename at all or 6059 * needs some special format of the options QDict, it needs to 6060 * implement the driver-specific bdrv_refresh_filename() function. 
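 *
 * As an illustration (hypothetical values) of the generic fallback below:
 * when no plain filename can be produced, bs->filename ends up as a
 * "json:" pseudo-filename such as
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *           "filename": "/tmp/test.qcow2"}}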
         */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}

/* The purpose of this accessor function is to allow the device models to
 * access the BlockAcctStats structure embedded inside a BlockDriverState
 * without being aware of the BlockDriverState structure layout.
 * It will go away once the BlockAcctStats structure is moved inside the
 * device models.
 */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
{
    return &bs->stats;
}
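
/* A typical (illustrative) use from a device model, through the accounting
 * API:
 *
 *     BlockAcctCookie cookie;
 *
 *     block_acct_start(bdrv_get_stats(bs), &cookie, bytes, BLOCK_ACCT_READ);
 *     ...submit the read and wait for it to complete...
 *     block_acct_done(bdrv_get_stats(bs), &cookie);
 */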