/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O request wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * throttle this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
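
/*
 * Illustrative sketch (not part of the original file): how a caller would
 * combine the two functions above to cap a drive at 10 MB/s of total
 * bandwidth. The ThrottleConfig bucket layout follows the throttle header
 * of this era and is an assumption of this sketch.
 *
 *     ThrottleConfig cfg;
 *
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; // assumed field
 *
 *     bdrv_io_limits_enable(bs);    // must come first: initializes timers
 *     bdrv_set_io_limits(bs, &cfg); // applies limits, restarts queued reqs
 */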

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;
    int i;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
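
/*
 * Illustrative sketch (assumption, not original code): how a block driver
 * registers itself through bdrv_register(). Real drivers do this from a
 * block_init() constructor; "mydrv" and its callbacks are invented here.
 * Because no .bdrv_co_readv is given, bdrv_register() installs the
 * coroutine and AIO emulation layers shown above.
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name   = "mydrv",               // hypothetical
 *         .instance_size = sizeof(BDRVMydrvState),
 *         .bdrv_open     = mydrv_open,
 *         .bdrv_close    = mydrv_close,
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *     block_init(bdrv_mydrv_init);
 */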

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char *filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
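
/*
 * Illustrative sketch (assumption, not original code): creating a 1 GiB
 * qcow2 image via bdrv_create(), mirroring what bdrv_append_temp_snapshot()
 * further down does for its temporary overlay.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     Error *err = NULL;
 *
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *     if (bdrv_create(drv, "/tmp/test.qcow2", opts, &err) < 0) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 *     qemu_opts_del(opts);
 */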

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
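
/*
 * Illustrative sketch (assumption, not original code): the probing contract
 * used by find_image_format() above. Each driver looks at the first bytes
 * of the image and returns a confidence score; the highest score wins. The
 * magic string and score below are invented for the example.
 *
 *     static int mydrv_probe(const uint8_t *buf, int buf_size,
 *                            const char *filename)
 *     {
 *         if (buf_size >= 4 && !memcmp(buf, "MYFM", 4)) {
 *             return 100;   // strong match on the header magic
 *         }
 *         return 0;         // not ours
 *     }
 */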

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* an empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}
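
/*
 * Illustrative sketch (assumption, not original code): mapping a
 * user-visible cache=... mode onto open flags with bdrv_parse_cache_flags()
 * above, e.g. while processing a drive option.
 *
 *     int flags = 0;
 *
 *     if (bdrv_parse_cache_flags("none", &flags) != 0) {
 *         // invalid cache mode string
 *     }
 *     // flags now contains BDRV_O_NOCACHE | BDRV_O_CACHE_WB:
 *     // O_DIRECT-style host access with a writeback guest cache model
 */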
"Driver '%s' can only be used for read-only devices" 936 : "Driver '%s' is not whitelisted", 937 drv->format_name); 938 return -ENOTSUP; 939 } 940 941 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ 942 if (flags & BDRV_O_COPY_ON_READ) { 943 if (!bs->read_only) { 944 bdrv_enable_copy_on_read(bs); 945 } else { 946 error_setg(errp, "Can't use copy-on-read on read-only device"); 947 return -EINVAL; 948 } 949 } 950 951 if (filename != NULL) { 952 pstrcpy(bs->filename, sizeof(bs->filename), filename); 953 } else { 954 bs->filename[0] = '\0'; 955 } 956 957 bs->drv = drv; 958 bs->opaque = g_malloc0(drv->instance_size); 959 960 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB); 961 962 /* Open the image, either directly or using a protocol */ 963 if (drv->bdrv_file_open) { 964 assert(file == NULL); 965 assert(!drv->bdrv_needs_filename || filename != NULL); 966 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err); 967 } else { 968 if (file == NULL) { 969 error_setg(errp, "Can't use '%s' as a block driver for the " 970 "protocol level", drv->format_name); 971 ret = -EINVAL; 972 goto free_and_fail; 973 } 974 bs->file = file; 975 ret = drv->bdrv_open(bs, options, open_flags, &local_err); 976 } 977 978 if (ret < 0) { 979 if (local_err) { 980 error_propagate(errp, local_err); 981 } else if (bs->filename[0]) { 982 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename); 983 } else { 984 error_setg_errno(errp, -ret, "Could not open image"); 985 } 986 goto free_and_fail; 987 } 988 989 ret = refresh_total_sectors(bs, bs->total_sectors); 990 if (ret < 0) { 991 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 992 goto free_and_fail; 993 } 994 995 bdrv_refresh_limits(bs); 996 assert(bdrv_opt_mem_align(bs) != 0); 997 assert((bs->request_alignment != 0) || bs->sg); 998 return 0; 999 1000 free_and_fail: 1001 bs->file = NULL; 1002 g_free(bs->opaque); 1003 bs->opaque = NULL; 1004 bs->drv = NULL; 1005 return ret; 1006 } 1007 1008 /* 1009 * Fills in default options for opening images and converts the legacy 1010 * filename/flags pair to option QDict entries. 
1011 */ 1012 static int bdrv_fill_options(QDict **options, const char *filename, int flags, 1013 Error **errp) 1014 { 1015 const char *drvname; 1016 bool protocol = flags & BDRV_O_PROTOCOL; 1017 bool parse_filename = false; 1018 Error *local_err = NULL; 1019 BlockDriver *drv; 1020 1021 if (!protocol) { 1022 return 0; 1023 } 1024 1025 /* Fetch the file name from the options QDict if necessary */ 1026 if (filename) { 1027 if (!qdict_haskey(*options, "filename")) { 1028 qdict_put(*options, "filename", qstring_from_str(filename)); 1029 parse_filename = true; 1030 } else { 1031 error_setg(errp, "Can't specify 'file' and 'filename' options at " 1032 "the same time"); 1033 return -EINVAL; 1034 } 1035 } 1036 1037 /* Find the right block driver */ 1038 filename = qdict_get_try_str(*options, "filename"); 1039 drvname = qdict_get_try_str(*options, "driver"); 1040 1041 if (!drvname) { 1042 if (filename) { 1043 drv = bdrv_find_protocol(filename, parse_filename); 1044 if (!drv) { 1045 error_setg(errp, "Unknown protocol"); 1046 return -EINVAL; 1047 } 1048 1049 drvname = drv->format_name; 1050 qdict_put(*options, "driver", qstring_from_str(drvname)); 1051 } else { 1052 error_setg(errp, "Must specify either driver or file"); 1053 return -EINVAL; 1054 } 1055 } 1056 1057 drv = bdrv_find_format(drvname); 1058 if (!drv) { 1059 error_setg(errp, "Unknown driver '%s'", drvname); 1060 return -ENOENT; 1061 } 1062 1063 /* Driver-specific filename parsing */ 1064 if (drv->bdrv_parse_filename && parse_filename) { 1065 drv->bdrv_parse_filename(filename, *options, &local_err); 1066 if (local_err) { 1067 error_propagate(errp, local_err); 1068 return -EINVAL; 1069 } 1070 1071 if (!drv->bdrv_needs_filename) { 1072 qdict_del(*options, "filename"); 1073 } 1074 } 1075 1076 return 0; 1077 } 1078 1079 /* 1080 * Opens a file using a protocol (file, host_device, nbd, ...) 1081 * 1082 * options is an indirect pointer to a QDict of options to pass to the block 1083 * drivers, or pointer to NULL for an empty set of options. If this function 1084 * takes ownership of the QDict reference, it will set *options to NULL; 1085 * otherwise, it will contain unused/unrecognized options after this function 1086 * returns. Then, the caller is responsible for freeing it. If it intends to 1087 * reuse the QDict, QINCREF() should be called beforehand. 
1088 */ 1089 static int bdrv_file_open(BlockDriverState *bs, QDict **options, int flags, 1090 Error **errp) 1091 { 1092 BlockDriver *drv; 1093 const char *filename; 1094 const char *drvname; 1095 Error *local_err = NULL; 1096 int ret; 1097 1098 filename = qdict_get_try_str(*options, "filename"); 1099 drvname = qdict_get_str(*options, "driver"); 1100 1101 drv = bdrv_find_format(drvname); 1102 assert(drv); 1103 qdict_del(*options, "driver"); 1104 1105 /* Open the file */ 1106 if (!drv->bdrv_file_open) { 1107 ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err); 1108 *options = NULL; 1109 } else { 1110 ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err); 1111 } 1112 if (ret < 0) { 1113 error_propagate(errp, local_err); 1114 goto fail; 1115 } 1116 1117 bs->growable = 1; 1118 return 0; 1119 1120 fail: 1121 return ret; 1122 } 1123 1124 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd) 1125 { 1126 1127 if (bs->backing_hd) { 1128 assert(bs->backing_blocker); 1129 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker); 1130 } else if (backing_hd) { 1131 error_setg(&bs->backing_blocker, 1132 "device is used as backing hd of '%s'", 1133 bs->device_name); 1134 } 1135 1136 bs->backing_hd = backing_hd; 1137 if (!backing_hd) { 1138 error_free(bs->backing_blocker); 1139 bs->backing_blocker = NULL; 1140 goto out; 1141 } 1142 bs->open_flags &= ~BDRV_O_NO_BACKING; 1143 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename); 1144 pstrcpy(bs->backing_format, sizeof(bs->backing_format), 1145 backing_hd->drv ? backing_hd->drv->format_name : ""); 1146 1147 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker); 1148 /* Otherwise we won't be able to commit due to check in bdrv_commit */ 1149 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, 1150 bs->backing_blocker); 1151 out: 1152 bdrv_refresh_limits(bs); 1153 } 1154 1155 /* 1156 * Opens the backing file for a BlockDriverState if not yet open 1157 * 1158 * options is a QDict of options to pass to the block drivers, or NULL for an 1159 * empty set of options. The reference to the QDict is transferred to this 1160 * function (even on failure), so if the caller intends to reuse the dictionary, 1161 * it needs to use QINCREF() before calling bdrv_file_open. 1162 */ 1163 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp) 1164 { 1165 char *backing_filename = g_malloc0(PATH_MAX); 1166 int ret = 0; 1167 BlockDriver *back_drv = NULL; 1168 BlockDriverState *backing_hd; 1169 Error *local_err = NULL; 1170 1171 if (bs->backing_hd != NULL) { 1172 QDECREF(options); 1173 goto free_exit; 1174 } 1175 1176 /* NULL means an empty set of options */ 1177 if (options == NULL) { 1178 options = qdict_new(); 1179 } 1180 1181 bs->open_flags &= ~BDRV_O_NO_BACKING; 1182 if (qdict_haskey(options, "file.filename")) { 1183 backing_filename[0] = '\0'; 1184 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) { 1185 QDECREF(options); 1186 goto free_exit; 1187 } else { 1188 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX); 1189 } 1190 1191 backing_hd = bdrv_new("", errp); 1192 1193 if (bs->backing_format[0] != '\0') { 1194 back_drv = bdrv_find_format(bs->backing_format); 1195 } 1196 1197 assert(bs->backing_hd == NULL); 1198 ret = bdrv_open(&backing_hd, 1199 *backing_filename ? 
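
/*
 * Illustrative sketch (assumption, not original code): what
 * bdrv_fill_options() above produces for a legacy filename at the protocol
 * level. Starting from an empty QDict and the filename "/tmp/disk.img",
 * the QDict afterwards is roughly
 *
 *     { "filename": "/tmp/disk.img", "driver": "file" }
 *
 * because bdrv_find_protocol() falls back to the "file" driver when the
 * name has no "<protocol>:" prefix.
 */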

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    backing_hd = bdrv_new("", errp);

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as a BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
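
/*
 * Illustrative sketch (assumption, not original code): the flattened option
 * form consumed by bdrv_open_image() above. With bdref_key = "file", a
 * QDict such as
 *
 *     { "file.driver": "file", "file.filename": "/tmp/disk.img" }
 *
 * is split by qdict_extract_subqdict() into the image options
 * { "driver": "file", "filename": "/tmp/disk.img" }. Alternatively,
 * { "file": "node0" } references an already-open block device by name.
 * A "json:{...}" filename (see parse_json_filename() below) flattens to
 * exactly the same form.
 */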

void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }
    total_size &= BDRV_SECTOR_MASK;

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("", &error_abort);

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            ret = -EINVAL;
            goto fail;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(options, json_options, false);
        QDECREF(json_options);
        filename = NULL;
    }

    ret = bdrv_fill_options(&options, filename, flags, &local_err);
    if (local_err) {
        goto fail;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }
    if (flags & BDRV_O_SNAPSHOT) {
        snapshot_flags = bdrv_temp_snapshot_flags(flags);
        flags = bdrv_backing_flags(flags);
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_inherited_flags(flags),
                          true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
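
/*
 * Illustrative sketch (assumption, not original code) for the reopen queue
 * API defined below: switching two drives to read-only in one transaction.
 * Either both are reopened or, if any prepare step fails, both are rolled
 * back via bdrv_reopen_abort().
 *
 *     Error *err = NULL;
 *     BlockReopenQueue *queue;
 *
 *     queue = bdrv_reopen_queue(NULL, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
 *     if (bdrv_reopen_multiple(queue, &err) < 0) {
 *         // no device was changed; report and free err
 *     }
 */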
1623 * 1624 */ 1625 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, 1626 BlockDriverState *bs, int flags) 1627 { 1628 assert(bs != NULL); 1629 1630 BlockReopenQueueEntry *bs_entry; 1631 if (bs_queue == NULL) { 1632 bs_queue = g_new0(BlockReopenQueue, 1); 1633 QSIMPLEQ_INIT(bs_queue); 1634 } 1635 1636 /* bdrv_open() masks this flag out */ 1637 flags &= ~BDRV_O_PROTOCOL; 1638 1639 if (bs->file) { 1640 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags)); 1641 } 1642 1643 bs_entry = g_new0(BlockReopenQueueEntry, 1); 1644 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry); 1645 1646 bs_entry->state.bs = bs; 1647 bs_entry->state.flags = flags; 1648 1649 return bs_queue; 1650 } 1651 1652 /* 1653 * Reopen multiple BlockDriverStates atomically & transactionally. 1654 * 1655 * The queue passed in (bs_queue) must have been built up previous 1656 * via bdrv_reopen_queue(). 1657 * 1658 * Reopens all BDS specified in the queue, with the appropriate 1659 * flags. All devices are prepared for reopen, and failure of any 1660 * device will cause all device changes to be abandonded, and intermediate 1661 * data cleaned up. 1662 * 1663 * If all devices prepare successfully, then the changes are committed 1664 * to all devices. 1665 * 1666 */ 1667 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) 1668 { 1669 int ret = -1; 1670 BlockReopenQueueEntry *bs_entry, *next; 1671 Error *local_err = NULL; 1672 1673 assert(bs_queue != NULL); 1674 1675 bdrv_drain_all(); 1676 1677 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { 1678 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) { 1679 error_propagate(errp, local_err); 1680 goto cleanup; 1681 } 1682 bs_entry->prepared = true; 1683 } 1684 1685 /* If we reach this point, we have success and just need to apply the 1686 * changes 1687 */ 1688 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { 1689 bdrv_reopen_commit(&bs_entry->state); 1690 } 1691 1692 ret = 0; 1693 1694 cleanup: 1695 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { 1696 if (ret && bs_entry->prepared) { 1697 bdrv_reopen_abort(&bs_entry->state); 1698 } 1699 g_free(bs_entry); 1700 } 1701 g_free(bs_queue); 1702 return ret; 1703 } 1704 1705 1706 /* Reopen a single BlockDriverState with the specified flags. */ 1707 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) 1708 { 1709 int ret = -1; 1710 Error *local_err = NULL; 1711 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags); 1712 1713 ret = bdrv_reopen_multiple(queue, &local_err); 1714 if (local_err != NULL) { 1715 error_propagate(errp, local_err); 1716 } 1717 return ret; 1718 } 1719 1720 1721 /* 1722 * Prepares a BlockDriverState for reopen. All changes are staged in the 1723 * 'opaque' field of the BDRVReopenState, which is used and allocated by 1724 * the block driver layer .bdrv_reopen_prepare() 1725 * 1726 * bs is the BlockDriverState to reopen 1727 * flags are the new open flags 1728 * queue is the reopen queue 1729 * 1730 * Returns 0 on success, non-zero on error. On error errp will be set 1731 * as well. 1732 * 1733 * On failure, bdrv_reopen_abort() will be called to clean up any data. 
1734 * It is the responsibility of the caller to then call the abort() or 1735 * commit() for any other BDS that have been left in a prepare() state 1736 * 1737 */ 1738 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, 1739 Error **errp) 1740 { 1741 int ret = -1; 1742 Error *local_err = NULL; 1743 BlockDriver *drv; 1744 1745 assert(reopen_state != NULL); 1746 assert(reopen_state->bs->drv != NULL); 1747 drv = reopen_state->bs->drv; 1748 1749 /* if we are to stay read-only, do not allow permission change 1750 * to r/w */ 1751 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) && 1752 reopen_state->flags & BDRV_O_RDWR) { 1753 error_set(errp, QERR_DEVICE_IS_READ_ONLY, 1754 reopen_state->bs->device_name); 1755 goto error; 1756 } 1757 1758 1759 ret = bdrv_flush(reopen_state->bs); 1760 if (ret) { 1761 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive", 1762 strerror(-ret)); 1763 goto error; 1764 } 1765 1766 if (drv->bdrv_reopen_prepare) { 1767 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err); 1768 if (ret) { 1769 if (local_err != NULL) { 1770 error_propagate(errp, local_err); 1771 } else { 1772 error_setg(errp, "failed while preparing to reopen image '%s'", 1773 reopen_state->bs->filename); 1774 } 1775 goto error; 1776 } 1777 } else { 1778 /* It is currently mandatory to have a bdrv_reopen_prepare() 1779 * handler for each supported drv. */ 1780 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, 1781 drv->format_name, reopen_state->bs->device_name, 1782 "reopening of file"); 1783 ret = -1; 1784 goto error; 1785 } 1786 1787 ret = 0; 1788 1789 error: 1790 return ret; 1791 } 1792 1793 /* 1794 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and 1795 * makes them final by swapping the staging BlockDriverState contents into 1796 * the active BlockDriverState contents. 
1797 */ 1798 void bdrv_reopen_commit(BDRVReopenState *reopen_state) 1799 { 1800 BlockDriver *drv; 1801 1802 assert(reopen_state != NULL); 1803 drv = reopen_state->bs->drv; 1804 assert(drv != NULL); 1805 1806 /* If there are any driver level actions to take */ 1807 if (drv->bdrv_reopen_commit) { 1808 drv->bdrv_reopen_commit(reopen_state); 1809 } 1810 1811 /* set BDS specific flags now */ 1812 reopen_state->bs->open_flags = reopen_state->flags; 1813 reopen_state->bs->enable_write_cache = !!(reopen_state->flags & 1814 BDRV_O_CACHE_WB); 1815 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR); 1816 1817 bdrv_refresh_limits(reopen_state->bs); 1818 } 1819 1820 /* 1821 * Abort the reopen, and delete and free the staged changes in 1822 * reopen_state 1823 */ 1824 void bdrv_reopen_abort(BDRVReopenState *reopen_state) 1825 { 1826 BlockDriver *drv; 1827 1828 assert(reopen_state != NULL); 1829 drv = reopen_state->bs->drv; 1830 assert(drv != NULL); 1831 1832 if (drv->bdrv_reopen_abort) { 1833 drv->bdrv_reopen_abort(reopen_state); 1834 } 1835 } 1836 1837 1838 void bdrv_close(BlockDriverState *bs) 1839 { 1840 if (bs->job) { 1841 block_job_cancel_sync(bs->job); 1842 } 1843 bdrv_drain_all(); /* complete I/O */ 1844 bdrv_flush(bs); 1845 bdrv_drain_all(); /* in case flush left pending I/O */ 1846 notifier_list_notify(&bs->close_notifiers, bs); 1847 1848 if (bs->drv) { 1849 if (bs->backing_hd) { 1850 BlockDriverState *backing_hd = bs->backing_hd; 1851 bdrv_set_backing_hd(bs, NULL); 1852 bdrv_unref(backing_hd); 1853 } 1854 bs->drv->bdrv_close(bs); 1855 g_free(bs->opaque); 1856 bs->opaque = NULL; 1857 bs->drv = NULL; 1858 bs->copy_on_read = 0; 1859 bs->backing_file[0] = '\0'; 1860 bs->backing_format[0] = '\0'; 1861 bs->total_sectors = 0; 1862 bs->encrypted = 0; 1863 bs->valid_key = 0; 1864 bs->sg = 0; 1865 bs->growable = 0; 1866 bs->zero_beyond_eof = false; 1867 QDECREF(bs->options); 1868 bs->options = NULL; 1869 1870 if (bs->file != NULL) { 1871 bdrv_unref(bs->file); 1872 bs->file = NULL; 1873 } 1874 } 1875 1876 bdrv_dev_change_media_cb(bs, false); 1877 1878 /*throttling disk I/O limits*/ 1879 if (bs->io_limits_enabled) { 1880 bdrv_io_limits_disable(bs); 1881 } 1882 } 1883 1884 void bdrv_close_all(void) 1885 { 1886 BlockDriverState *bs; 1887 1888 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 1889 AioContext *aio_context = bdrv_get_aio_context(bs); 1890 1891 aio_context_acquire(aio_context); 1892 bdrv_close(bs); 1893 aio_context_release(aio_context); 1894 } 1895 } 1896 1897 /* Check if any requests are in-flight (including throttled requests) */ 1898 static bool bdrv_requests_pending(BlockDriverState *bs) 1899 { 1900 if (!QLIST_EMPTY(&bs->tracked_requests)) { 1901 return true; 1902 } 1903 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 1904 return true; 1905 } 1906 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 1907 return true; 1908 } 1909 if (bs->file && bdrv_requests_pending(bs->file)) { 1910 return true; 1911 } 1912 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { 1913 return true; 1914 } 1915 return false; 1916 } 1917 1918 /* 1919 * Wait for pending requests to complete across all BlockDriverStates 1920 * 1921 * This function does not flush data to disk, use bdrv_flush_all() for that 1922 * after calling this function. 
1923 * 1924 * Note that completion of an asynchronous I/O operation can trigger any 1925 * number of other I/O operations on other devices---for example a coroutine 1926 * can be arbitrarily complex and a constant flow of I/O can come until the 1927 * coroutine is complete. Because of this, it is not possible to have a 1928 * function to drain a single device's I/O queue. 1929 */ 1930 void bdrv_drain_all(void) 1931 { 1932 /* Always run first iteration so any pending completion BHs run */ 1933 bool busy = true; 1934 BlockDriverState *bs; 1935 1936 while (busy) { 1937 busy = false; 1938 1939 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 1940 AioContext *aio_context = bdrv_get_aio_context(bs); 1941 bool bs_busy; 1942 1943 aio_context_acquire(aio_context); 1944 bdrv_start_throttled_reqs(bs); 1945 bs_busy = bdrv_requests_pending(bs); 1946 bs_busy |= aio_poll(aio_context, bs_busy); 1947 aio_context_release(aio_context); 1948 1949 busy |= bs_busy; 1950 } 1951 } 1952 } 1953 1954 /* make a BlockDriverState anonymous by removing from bdrv_state and 1955 * graph_bdrv_state list. 1956 Also, NULL terminate the device_name to prevent double remove */ 1957 void bdrv_make_anon(BlockDriverState *bs) 1958 { 1959 if (bs->device_name[0] != '\0') { 1960 QTAILQ_REMOVE(&bdrv_states, bs, device_list); 1961 } 1962 bs->device_name[0] = '\0'; 1963 if (bs->node_name[0] != '\0') { 1964 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list); 1965 } 1966 bs->node_name[0] = '\0'; 1967 } 1968 1969 static void bdrv_rebind(BlockDriverState *bs) 1970 { 1971 if (bs->drv && bs->drv->bdrv_rebind) { 1972 bs->drv->bdrv_rebind(bs); 1973 } 1974 } 1975 1976 static void bdrv_move_feature_fields(BlockDriverState *bs_dest, 1977 BlockDriverState *bs_src) 1978 { 1979 /* move some fields that need to stay attached to the device */ 1980 1981 /* dev info */ 1982 bs_dest->dev_ops = bs_src->dev_ops; 1983 bs_dest->dev_opaque = bs_src->dev_opaque; 1984 bs_dest->dev = bs_src->dev; 1985 bs_dest->guest_block_size = bs_src->guest_block_size; 1986 bs_dest->copy_on_read = bs_src->copy_on_read; 1987 1988 bs_dest->enable_write_cache = bs_src->enable_write_cache; 1989 1990 /* i/o throttled req */ 1991 memcpy(&bs_dest->throttle_state, 1992 &bs_src->throttle_state, 1993 sizeof(ThrottleState)); 1994 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0]; 1995 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1]; 1996 bs_dest->io_limits_enabled = bs_src->io_limits_enabled; 1997 1998 /* r/w error */ 1999 bs_dest->on_read_error = bs_src->on_read_error; 2000 bs_dest->on_write_error = bs_src->on_write_error; 2001 2002 /* i/o status */ 2003 bs_dest->iostatus_enabled = bs_src->iostatus_enabled; 2004 bs_dest->iostatus = bs_src->iostatus; 2005 2006 /* dirty bitmap */ 2007 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps; 2008 2009 /* reference count */ 2010 bs_dest->refcnt = bs_src->refcnt; 2011 2012 /* job */ 2013 bs_dest->job = bs_src->job; 2014 2015 /* keep the same entry in bdrv_states */ 2016 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name), 2017 bs_src->device_name); 2018 bs_dest->device_list = bs_src->device_list; 2019 memcpy(bs_dest->op_blockers, bs_src->op_blockers, 2020 sizeof(bs_dest->op_blockers)); 2021 } 2022 2023 /* 2024 * Swap bs contents for two image chains while they are live, 2025 * while keeping required fields on the BlockDriverState that is 2026 * actually attached to a device. 2027 * 2028 * This will modify the BlockDriverState fields, and swap contents 2029 * between bs_new and bs_old. Both bs_new and bs_old are modified. 
2030 * 2031 * bs_new is required to be anonymous. 2032 * 2033 * This function does not create any image files. 2034 */ 2035 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old) 2036 { 2037 BlockDriverState tmp; 2038 2039 /* The code needs to swap the node_name but simply swapping node_list won't 2040 * work so first remove the nodes from the graph list, do the swap then 2041 * insert them back if needed. 2042 */ 2043 if (bs_new->node_name[0] != '\0') { 2044 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list); 2045 } 2046 if (bs_old->node_name[0] != '\0') { 2047 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list); 2048 } 2049 2050 /* bs_new must be anonymous and shouldn't have anything fancy enabled */ 2051 assert(bs_new->device_name[0] == '\0'); 2052 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps)); 2053 assert(bs_new->job == NULL); 2054 assert(bs_new->dev == NULL); 2055 assert(bs_new->io_limits_enabled == false); 2056 assert(!throttle_have_timer(&bs_new->throttle_state)); 2057 2058 tmp = *bs_new; 2059 *bs_new = *bs_old; 2060 *bs_old = tmp; 2061 2062 /* there are some fields that should not be swapped, move them back */ 2063 bdrv_move_feature_fields(&tmp, bs_old); 2064 bdrv_move_feature_fields(bs_old, bs_new); 2065 bdrv_move_feature_fields(bs_new, &tmp); 2066 2067 /* bs_new shouldn't be in bdrv_states even after the swap! */ 2068 assert(bs_new->device_name[0] == '\0'); 2069 2070 /* Check a few fields that should remain attached to the device */ 2071 assert(bs_new->dev == NULL); 2072 assert(bs_new->job == NULL); 2073 assert(bs_new->io_limits_enabled == false); 2074 assert(!throttle_have_timer(&bs_new->throttle_state)); 2075 2076 /* insert the nodes back into the graph node list if needed */ 2077 if (bs_new->node_name[0] != '\0') { 2078 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list); 2079 } 2080 if (bs_old->node_name[0] != '\0') { 2081 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list); 2082 } 2083 2084 bdrv_rebind(bs_new); 2085 bdrv_rebind(bs_old); 2086 } 2087 2088 /* 2089 * Add new bs contents at the top of an image chain while the chain is 2090 * live, while keeping required fields on the top layer. 2091 * 2092 * This will modify the BlockDriverState fields, and swap contents 2093 * between bs_new and bs_top. Both bs_new and bs_top are modified. 2094 * 2095 * bs_new is required to be anonymous. 2096 * 2097 * This function does not create any image files. 2098 */ 2099 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) 2100 { 2101 bdrv_swap(bs_new, bs_top); 2102 2103 /* The contents of 'tmp' will become bs_top, as we are 2104 * swapping bs_new and bs_top contents. 
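 * ('tmp' is the temporary used inside bdrv_swap() above.  Concretely: after
 * the swap, bs_top holds the new image's contents and bs_new holds what
 * used to be the top, so the call below re-links the old top as the
 * backing file of the new top.)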
*/ 2105 bdrv_set_backing_hd(bs_top, bs_new); 2106 } 2107 2108 static void bdrv_delete(BlockDriverState *bs) 2109 { 2110 assert(!bs->dev); 2111 assert(!bs->job); 2112 assert(bdrv_op_blocker_is_empty(bs)); 2113 assert(!bs->refcnt); 2114 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 2115 2116 bdrv_close(bs); 2117 2118 /* remove from list, if necessary */ 2119 bdrv_make_anon(bs); 2120 2121 g_free(bs); 2122 } 2123 2124 int bdrv_attach_dev(BlockDriverState *bs, void *dev) 2125 /* TODO change to DeviceState *dev when all users are qdevified */ 2126 { 2127 if (bs->dev) { 2128 return -EBUSY; 2129 } 2130 bs->dev = dev; 2131 bdrv_iostatus_reset(bs); 2132 return 0; 2133 } 2134 2135 /* TODO qdevified devices don't use this, remove when devices are qdevified */ 2136 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev) 2137 { 2138 if (bdrv_attach_dev(bs, dev) < 0) { 2139 abort(); 2140 } 2141 } 2142 2143 void bdrv_detach_dev(BlockDriverState *bs, void *dev) 2144 /* TODO change to DeviceState *dev when all users are qdevified */ 2145 { 2146 assert(bs->dev == dev); 2147 bs->dev = NULL; 2148 bs->dev_ops = NULL; 2149 bs->dev_opaque = NULL; 2150 bs->guest_block_size = 512; 2151 } 2152 2153 /* TODO change to return DeviceState * when all users are qdevified */ 2154 void *bdrv_get_attached_dev(BlockDriverState *bs) 2155 { 2156 return bs->dev; 2157 } 2158 2159 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops, 2160 void *opaque) 2161 { 2162 bs->dev_ops = ops; 2163 bs->dev_opaque = opaque; 2164 } 2165 2166 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load) 2167 { 2168 if (bs->dev_ops && bs->dev_ops->change_media_cb) { 2169 bool tray_was_closed = !bdrv_dev_is_tray_open(bs); 2170 bs->dev_ops->change_media_cb(bs->dev_opaque, load); 2171 if (tray_was_closed) { 2172 /* tray open */ 2173 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs), 2174 true, &error_abort); 2175 } 2176 if (load) { 2177 /* tray close */ 2178 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs), 2179 false, &error_abort); 2180 } 2181 } 2182 } 2183 2184 bool bdrv_dev_has_removable_media(BlockDriverState *bs) 2185 { 2186 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb); 2187 } 2188 2189 void bdrv_dev_eject_request(BlockDriverState *bs, bool force) 2190 { 2191 if (bs->dev_ops && bs->dev_ops->eject_request_cb) { 2192 bs->dev_ops->eject_request_cb(bs->dev_opaque, force); 2193 } 2194 } 2195 2196 bool bdrv_dev_is_tray_open(BlockDriverState *bs) 2197 { 2198 if (bs->dev_ops && bs->dev_ops->is_tray_open) { 2199 return bs->dev_ops->is_tray_open(bs->dev_opaque); 2200 } 2201 return false; 2202 } 2203 2204 static void bdrv_dev_resize_cb(BlockDriverState *bs) 2205 { 2206 if (bs->dev_ops && bs->dev_ops->resize_cb) { 2207 bs->dev_ops->resize_cb(bs->dev_opaque); 2208 } 2209 } 2210 2211 bool bdrv_dev_is_medium_locked(BlockDriverState *bs) 2212 { 2213 if (bs->dev_ops && bs->dev_ops->is_medium_locked) { 2214 return bs->dev_ops->is_medium_locked(bs->dev_opaque); 2215 } 2216 return false; 2217 } 2218 2219 /* 2220 * Run consistency checks on an image 2221 * 2222 * Returns 0 if the check could be completed (it doesn't mean that the image is 2223 * free of errors) or -errno when an internal error occurred. The results of the 2224 * check are stored in res. 
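 *
 * Hypothetical usage sketch (BDRV_FIX_ERRORS and the BdrvCheckResult
 * fields are provided by the block layer headers):
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, BDRV_FIX_ERRORS);
 *     if (ret < 0) {
 *         // the check itself could not run, e.g. -ENOTSUP
 *     } else if (res.corruptions || res.leaks) {
 *         // problems were found (and possibly repaired)
 *     }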
2225 */ 2226 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) 2227 { 2228 if (bs->drv->bdrv_check == NULL) { 2229 return -ENOTSUP; 2230 } 2231 2232 memset(res, 0, sizeof(*res)); 2233 return bs->drv->bdrv_check(bs, res, fix); 2234 } 2235 2236 #define COMMIT_BUF_SECTORS 2048 2237 2238 /* commit COW file into the raw image */ 2239 int bdrv_commit(BlockDriverState *bs) 2240 { 2241 BlockDriver *drv = bs->drv; 2242 int64_t sector, total_sectors, length, backing_length; 2243 int n, ro, open_flags; 2244 int ret = 0; 2245 uint8_t *buf = NULL; 2246 char filename[PATH_MAX]; 2247 2248 if (!drv) 2249 return -ENOMEDIUM; 2250 2251 if (!bs->backing_hd) { 2252 return -ENOTSUP; 2253 } 2254 2255 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) || 2256 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) { 2257 return -EBUSY; 2258 } 2259 2260 ro = bs->backing_hd->read_only; 2261 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */ 2262 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename); 2263 open_flags = bs->backing_hd->open_flags; 2264 2265 if (ro) { 2266 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) { 2267 return -EACCES; 2268 } 2269 } 2270 2271 length = bdrv_getlength(bs); 2272 if (length < 0) { 2273 ret = length; 2274 goto ro_cleanup; 2275 } 2276 2277 backing_length = bdrv_getlength(bs->backing_hd); 2278 if (backing_length < 0) { 2279 ret = backing_length; 2280 goto ro_cleanup; 2281 } 2282 2283 /* If our top snapshot is larger than the backing file image, 2284 * grow the backing file image if possible. If not possible, 2285 * we must return an error */ 2286 if (length > backing_length) { 2287 ret = bdrv_truncate(bs->backing_hd, length); 2288 if (ret < 0) { 2289 goto ro_cleanup; 2290 } 2291 } 2292 2293 total_sectors = length >> BDRV_SECTOR_BITS; 2294 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); 2295 2296 for (sector = 0; sector < total_sectors; sector += n) { 2297 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n); 2298 if (ret < 0) { 2299 goto ro_cleanup; 2300 } 2301 if (ret) { 2302 ret = bdrv_read(bs, sector, buf, n); 2303 if (ret < 0) { 2304 goto ro_cleanup; 2305 } 2306 2307 ret = bdrv_write(bs->backing_hd, sector, buf, n); 2308 if (ret < 0) { 2309 goto ro_cleanup; 2310 } 2311 } 2312 } 2313 2314 if (drv->bdrv_make_empty) { 2315 ret = drv->bdrv_make_empty(bs); 2316 if (ret < 0) { 2317 goto ro_cleanup; 2318 } 2319 bdrv_flush(bs); 2320 } 2321 2322 /* 2323 * Make sure all data we wrote to the backing device is actually 2324 * stable on disk. 2325 */ 2326 if (bs->backing_hd) { 2327 bdrv_flush(bs->backing_hd); 2328 } 2329 2330 ret = 0; 2331 ro_cleanup: 2332 g_free(buf); 2333 2334 if (ro) { 2335 /* ignoring error return here */ 2336 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL); 2337 } 2338 2339 return ret; 2340 } 2341 2342 int bdrv_commit_all(void) 2343 { 2344 BlockDriverState *bs; 2345 2346 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 2347 AioContext *aio_context = bdrv_get_aio_context(bs); 2348 2349 aio_context_acquire(aio_context); 2350 if (bs->drv && bs->backing_hd) { 2351 int ret = bdrv_commit(bs); 2352 if (ret < 0) { 2353 aio_context_release(aio_context); 2354 return ret; 2355 } 2356 } 2357 aio_context_release(aio_context); 2358 } 2359 return 0; 2360 } 2361 2362 /** 2363 * Remove an active request from the tracked requests list 2364 * 2365 * This function should be called when a tracked request is completing. 
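 *
 * The expected pairing, mirroring the pattern used by bdrv_co_do_preadv()
 * later in this file:
 *
 *     BdrvTrackedRequest req;
 *     tracked_request_begin(&req, bs, offset, bytes, false);
 *     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, qiov, flags);
 *     tracked_request_end(&req);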
2366 */ 2367 static void tracked_request_end(BdrvTrackedRequest *req) 2368 { 2369 if (req->serialising) { 2370 req->bs->serialising_in_flight--; 2371 } 2372 2373 QLIST_REMOVE(req, list); 2374 qemu_co_queue_restart_all(&req->wait_queue); 2375 } 2376 2377 /** 2378 * Add an active request to the tracked requests list 2379 */ 2380 static void tracked_request_begin(BdrvTrackedRequest *req, 2381 BlockDriverState *bs, 2382 int64_t offset, 2383 unsigned int bytes, bool is_write) 2384 { 2385 *req = (BdrvTrackedRequest){ 2386 .bs = bs, 2387 .offset = offset, 2388 .bytes = bytes, 2389 .is_write = is_write, 2390 .co = qemu_coroutine_self(), 2391 .serialising = false, 2392 .overlap_offset = offset, 2393 .overlap_bytes = bytes, 2394 }; 2395 2396 qemu_co_queue_init(&req->wait_queue); 2397 2398 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 2399 } 2400 2401 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 2402 { 2403 int64_t overlap_offset = req->offset & ~(align - 1); 2404 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 2405 - overlap_offset; 2406 2407 if (!req->serialising) { 2408 req->bs->serialising_in_flight++; 2409 req->serialising = true; 2410 } 2411 2412 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 2413 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 2414 } 2415 2416 /** 2417 * Round a region to cluster boundaries 2418 */ 2419 void bdrv_round_to_clusters(BlockDriverState *bs, 2420 int64_t sector_num, int nb_sectors, 2421 int64_t *cluster_sector_num, 2422 int *cluster_nb_sectors) 2423 { 2424 BlockDriverInfo bdi; 2425 2426 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 2427 *cluster_sector_num = sector_num; 2428 *cluster_nb_sectors = nb_sectors; 2429 } else { 2430 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 2431 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 2432 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 2433 nb_sectors, c); 2434 } 2435 } 2436 2437 static int bdrv_get_cluster_size(BlockDriverState *bs) 2438 { 2439 BlockDriverInfo bdi; 2440 int ret; 2441 2442 ret = bdrv_get_info(bs, &bdi); 2443 if (ret < 0 || bdi.cluster_size == 0) { 2444 return bs->request_alignment; 2445 } else { 2446 return bdi.cluster_size; 2447 } 2448 } 2449 2450 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 2451 int64_t offset, unsigned int bytes) 2452 { 2453 /* aaaa bbbb */ 2454 if (offset >= req->overlap_offset + req->overlap_bytes) { 2455 return false; 2456 } 2457 /* bbbb aaaa */ 2458 if (req->overlap_offset >= offset + bytes) { 2459 return false; 2460 } 2461 return true; 2462 } 2463 2464 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 2465 { 2466 BlockDriverState *bs = self->bs; 2467 BdrvTrackedRequest *req; 2468 bool retry; 2469 bool waited = false; 2470 2471 if (!bs->serialising_in_flight) { 2472 return false; 2473 } 2474 2475 do { 2476 retry = false; 2477 QLIST_FOREACH(req, &bs->tracked_requests, list) { 2478 if (req == self || (!req->serialising && !self->serialising)) { 2479 continue; 2480 } 2481 if (tracked_request_overlaps(req, self->overlap_offset, 2482 self->overlap_bytes)) 2483 { 2484 /* Hitting this means there was a reentrant request, for 2485 * example, a block driver issuing nested requests. This must 2486 * never happen since it means deadlock. 
2487 */ 2488 assert(qemu_coroutine_self() != req->co); 2489 2490 /* If the request is already (indirectly) waiting for us, or 2491 * will wait for us as soon as it wakes up, then just go on 2492 * (instead of producing a deadlock in the former case). */ 2493 if (!req->waiting_for) { 2494 self->waiting_for = req; 2495 qemu_co_queue_wait(&req->wait_queue); 2496 self->waiting_for = NULL; 2497 retry = true; 2498 waited = true; 2499 break; 2500 } 2501 } 2502 } 2503 } while (retry); 2504 2505 return waited; 2506 } 2507 2508 /* 2509 * Return values: 2510 * 0 - success 2511 * -EINVAL - backing format specified, but no file 2512 * -ENOSPC - can't update the backing file because no space is left in the 2513 * image file header 2514 * -ENOTSUP - format driver doesn't support changing the backing file 2515 */ 2516 int bdrv_change_backing_file(BlockDriverState *bs, 2517 const char *backing_file, const char *backing_fmt) 2518 { 2519 BlockDriver *drv = bs->drv; 2520 int ret; 2521 2522 /* Backing file format doesn't make sense without a backing file */ 2523 if (backing_fmt && !backing_file) { 2524 return -EINVAL; 2525 } 2526 2527 if (drv->bdrv_change_backing_file != NULL) { 2528 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt); 2529 } else { 2530 ret = -ENOTSUP; 2531 } 2532 2533 if (ret == 0) { 2534 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); 2535 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); 2536 } 2537 return ret; 2538 } 2539 2540 /* 2541 * Finds the image layer in the chain that has 'bs' as its backing file. 2542 * 2543 * active is the current topmost image. 2544 * 2545 * Returns NULL if bs is not found in active's image chain, 2546 * or if active == bs. 2547 */ 2548 BlockDriverState *bdrv_find_overlay(BlockDriverState *active, 2549 BlockDriverState *bs) 2550 { 2551 BlockDriverState *overlay = NULL; 2552 BlockDriverState *intermediate; 2553 2554 assert(active != NULL); 2555 assert(bs != NULL); 2556 2557 /* if bs is the same as active, then by definition it has no overlay 2558 */ 2559 if (active == bs) { 2560 return NULL; 2561 } 2562 2563 intermediate = active; 2564 while (intermediate->backing_hd) { 2565 if (intermediate->backing_hd == bs) { 2566 overlay = intermediate; 2567 break; 2568 } 2569 intermediate = intermediate->backing_hd; 2570 } 2571 2572 return overlay; 2573 } 2574 2575 typedef struct BlkIntermediateStates { 2576 BlockDriverState *bs; 2577 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; 2578 } BlkIntermediateStates; 2579 2580 2581 /* 2582 * Drops images above 'base' up to and including 'top', and sets the image 2583 * above 'top' to have base as its backing file. 2584 * 2585 * Requires that the overlay to 'top' is opened r/w, so that the backing file 2586 * information in 'bs' can be properly updated. 
2587 * 2588 * E.g., this will convert the following chain: 2589 * bottom <- base <- intermediate <- top <- active 2590 * 2591 * to 2592 * 2593 * bottom <- base <- active 2594 * 2595 * It is allowed for bottom==base, in which case it converts: 2596 * 2597 * base <- intermediate <- top <- active 2598 * 2599 * to 2600 * 2601 * base <- active 2602 * 2603 * Error conditions: 2604 * if active == top, that is considered an error 2605 * 2606 */ 2607 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, 2608 BlockDriverState *base) 2609 { 2610 BlockDriverState *intermediate; 2611 BlockDriverState *base_bs = NULL; 2612 BlockDriverState *new_top_bs = NULL; 2613 BlkIntermediateStates *intermediate_state, *next; 2614 int ret = -EIO; 2615 2616 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete; 2617 QSIMPLEQ_INIT(&states_to_delete); 2618 2619 if (!top->drv || !base->drv) { 2620 goto exit; 2621 } 2622 2623 new_top_bs = bdrv_find_overlay(active, top); 2624 2625 if (new_top_bs == NULL) { 2626 /* we could not find the image above 'top', this is an error */ 2627 goto exit; 2628 } 2629 2630 /* special case of new_top_bs->backing_hd already pointing to base - nothing 2631 * to do, no intermediate images */ 2632 if (new_top_bs->backing_hd == base) { 2633 ret = 0; 2634 goto exit; 2635 } 2636 2637 intermediate = top; 2638 2639 /* now we will go down through the list, and add each BDS we find 2640 * into our deletion queue, until we hit the 'base' 2641 */ 2642 while (intermediate) { 2643 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates)); 2644 intermediate_state->bs = intermediate; 2645 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry); 2646 2647 if (intermediate->backing_hd == base) { 2648 base_bs = intermediate->backing_hd; 2649 break; 2650 } 2651 intermediate = intermediate->backing_hd; 2652 } 2653 if (base_bs == NULL) { 2654 /* something went wrong, we did not end at the base. safely 2655 * unravel everything, and exit with error */ 2656 goto exit; 2657 } 2658 2659 /* success - we can delete the intermediate states, and link top->base */ 2660 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename, 2661 base_bs->drv ? 
base_bs->drv->format_name : ""); 2662 if (ret) { 2663 goto exit; 2664 } 2665 bdrv_set_backing_hd(new_top_bs, base_bs); 2666 2667 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2668 /* so that bdrv_close() does not recursively close the chain */ 2669 bdrv_set_backing_hd(intermediate_state->bs, NULL); 2670 bdrv_unref(intermediate_state->bs); 2671 } 2672 ret = 0; 2673 2674 exit: 2675 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2676 g_free(intermediate_state); 2677 } 2678 return ret; 2679 } 2680 2681 2682 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 2683 size_t size) 2684 { 2685 int64_t len; 2686 2687 if (size > INT_MAX) { 2688 return -EIO; 2689 } 2690 2691 if (!bdrv_is_inserted(bs)) 2692 return -ENOMEDIUM; 2693 2694 if (bs->growable) 2695 return 0; 2696 2697 len = bdrv_getlength(bs); 2698 2699 if (offset < 0) 2700 return -EIO; 2701 2702 if ((offset > len) || (len - offset < size)) 2703 return -EIO; 2704 2705 return 0; 2706 } 2707 2708 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 2709 int nb_sectors) 2710 { 2711 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { 2712 return -EIO; 2713 } 2714 2715 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 2716 nb_sectors * BDRV_SECTOR_SIZE); 2717 } 2718 2719 typedef struct RwCo { 2720 BlockDriverState *bs; 2721 int64_t offset; 2722 QEMUIOVector *qiov; 2723 bool is_write; 2724 int ret; 2725 BdrvRequestFlags flags; 2726 } RwCo; 2727 2728 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 2729 { 2730 RwCo *rwco = opaque; 2731 2732 if (!rwco->is_write) { 2733 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 2734 rwco->qiov->size, rwco->qiov, 2735 rwco->flags); 2736 } else { 2737 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 2738 rwco->qiov->size, rwco->qiov, 2739 rwco->flags); 2740 } 2741 } 2742 2743 /* 2744 * Process a vectored synchronous request using coroutines 2745 */ 2746 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 2747 QEMUIOVector *qiov, bool is_write, 2748 BdrvRequestFlags flags) 2749 { 2750 Coroutine *co; 2751 RwCo rwco = { 2752 .bs = bs, 2753 .offset = offset, 2754 .qiov = qiov, 2755 .is_write = is_write, 2756 .ret = NOT_DONE, 2757 .flags = flags, 2758 }; 2759 2760 /** 2761 * In sync call context, when the vcpu is blocked, this throttling timer 2762 * will not fire; so the I/O throttling function has to be disabled here 2763 * if it has been enabled. 
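 * (Otherwise a synchronous request could sit in the throttled queue and
 * spin in aio_poll() below, waiting for a throttle timer that never fires.)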
2764 */ 2765 if (bs->io_limits_enabled) { 2766 fprintf(stderr, "Disabling I/O throttling on '%s' due " 2767 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 2768 bdrv_io_limits_disable(bs); 2769 } 2770 2771 if (qemu_in_coroutine()) { 2772 /* Fast-path if already in coroutine context */ 2773 bdrv_rw_co_entry(&rwco); 2774 } else { 2775 AioContext *aio_context = bdrv_get_aio_context(bs); 2776 2777 co = qemu_coroutine_create(bdrv_rw_co_entry); 2778 qemu_coroutine_enter(co, &rwco); 2779 while (rwco.ret == NOT_DONE) { 2780 aio_poll(aio_context, true); 2781 } 2782 } 2783 return rwco.ret; 2784 } 2785 2786 /* 2787 * Process a synchronous request using coroutines 2788 */ 2789 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 2790 int nb_sectors, bool is_write, BdrvRequestFlags flags) 2791 { 2792 QEMUIOVector qiov; 2793 struct iovec iov = { 2794 .iov_base = (void *)buf, 2795 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 2796 }; 2797 2798 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { 2799 return -EINVAL; 2800 } 2801 2802 qemu_iovec_init_external(&qiov, &iov, 1); 2803 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 2804 &qiov, is_write, flags); 2805 } 2806 2807 /* return < 0 if error. See bdrv_write() for the return codes */ 2808 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 2809 uint8_t *buf, int nb_sectors) 2810 { 2811 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 2812 } 2813 2814 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 2815 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 2816 uint8_t *buf, int nb_sectors) 2817 { 2818 bool enabled; 2819 int ret; 2820 2821 enabled = bs->io_limits_enabled; 2822 bs->io_limits_enabled = false; 2823 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 2824 bs->io_limits_enabled = enabled; 2825 return ret; 2826 } 2827 2828 /* Return < 0 if error. Important errors are: 2829 -EIO generic I/O error (may happen for all errors) 2830 -ENOMEDIUM No media inserted. 2831 -EINVAL Invalid sector number or nb_sectors 2832 -EACCES Trying to write a read-only device 2833 */ 2834 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 2835 const uint8_t *buf, int nb_sectors) 2836 { 2837 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 2838 } 2839 2840 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 2841 int nb_sectors, BdrvRequestFlags flags) 2842 { 2843 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 2844 BDRV_REQ_ZERO_WRITE | flags); 2845 } 2846 2847 /* 2848 * Completely zero out a block device with the help of bdrv_write_zeroes. 2849 * The operation is sped up by checking the block status and only writing 2850 * zeroes to the device if they currently do not return zeroes. Optional 2851 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 2852 * 2853 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
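 *
 * Hypothetical usage sketch:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);   // prefer discarding
 *     if (ret < 0) {
 *         // failed part-way; earlier sectors may already be zeroed
 *     }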
2854 */ 2855 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 2856 { 2857 int64_t target_size; 2858 int64_t ret, nb_sectors, sector_num = 0; 2859 int n; 2860 2861 target_size = bdrv_getlength(bs); 2862 if (target_size < 0) { 2863 return target_size; 2864 } 2865 target_size /= BDRV_SECTOR_SIZE; 2866 2867 for (;;) { 2868 nb_sectors = target_size - sector_num; 2869 if (nb_sectors <= 0) { 2870 return 0; 2871 } 2872 if (nb_sectors > INT_MAX) { 2873 nb_sectors = INT_MAX; 2874 } 2875 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 2876 if (ret < 0) { 2877 error_report("error getting block status at sector %" PRId64 ": %s", 2878 sector_num, strerror(-ret)); 2879 return ret; 2880 } 2881 if (ret & BDRV_BLOCK_ZERO) { 2882 sector_num += n; 2883 continue; 2884 } 2885 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 2886 if (ret < 0) { 2887 error_report("error writing zeroes at sector %" PRId64 ": %s", 2888 sector_num, strerror(-ret)); 2889 return ret; 2890 } 2891 sector_num += n; 2892 } 2893 } 2894 2895 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 2896 { 2897 QEMUIOVector qiov; 2898 struct iovec iov = { 2899 .iov_base = (void *)buf, 2900 .iov_len = bytes, 2901 }; 2902 int ret; 2903 2904 if (bytes < 0) { 2905 return -EINVAL; 2906 } 2907 2908 qemu_iovec_init_external(&qiov, &iov, 1); 2909 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 2910 if (ret < 0) { 2911 return ret; 2912 } 2913 2914 return bytes; 2915 } 2916 2917 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 2918 { 2919 int ret; 2920 2921 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 2922 if (ret < 0) { 2923 return ret; 2924 } 2925 2926 return qiov->size; 2927 } 2928 2929 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 2930 const void *buf, int bytes) 2931 { 2932 QEMUIOVector qiov; 2933 struct iovec iov = { 2934 .iov_base = (void *) buf, 2935 .iov_len = bytes, 2936 }; 2937 2938 if (bytes < 0) { 2939 return -EINVAL; 2940 } 2941 2942 qemu_iovec_init_external(&qiov, &iov, 1); 2943 return bdrv_pwritev(bs, offset, &qiov); 2944 } 2945 2946 /* 2947 * Writes to the file and ensures that no writes are reordered across this 2948 * request (acts as a barrier) 2949 * 2950 * Returns 0 on success, -errno in error cases. 2951 */ 2952 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 2953 const void *buf, int count) 2954 { 2955 int ret; 2956 2957 ret = bdrv_pwrite(bs, offset, buf, count); 2958 if (ret < 0) { 2959 return ret; 2960 } 2961 2962 /* No flush needed for cache modes that already do it */ 2963 if (bs->enable_write_cache) { 2964 bdrv_flush(bs); 2965 } 2966 2967 return 0; 2968 } 2969 2970 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 2971 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 2972 { 2973 /* Perform I/O through a temporary buffer so that users who scribble over 2974 * their read buffer while the operation is in progress do not end up 2975 * modifying the image file. This is critical for zero-copy guest I/O 2976 * where anything might happen inside guest memory. 2977 */ 2978 void *bounce_buffer; 2979 2980 BlockDriver *drv = bs->drv; 2981 struct iovec iov; 2982 QEMUIOVector bounce_qiov; 2983 int64_t cluster_sector_num; 2984 int cluster_nb_sectors; 2985 size_t skip_bytes; 2986 int ret; 2987 2988 /* Cover entire cluster so no additional backing file I/O is required when 2989 * allocating cluster in the image file. 
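 *
 * Worked example (assuming a 64 KiB cluster size, i.e. 128 sectors): a
 * copy-on-read of sectors [130, 140) is widened by the call below to the
 * cluster-aligned range [128, 256), so the whole cluster is populated in
 * one pass.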
2990 */ 2991 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 2992 &cluster_sector_num, &cluster_nb_sectors); 2993 2994 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 2995 cluster_sector_num, cluster_nb_sectors); 2996 2997 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 2998 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); 2999 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 3000 3001 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 3002 &bounce_qiov); 3003 if (ret < 0) { 3004 goto err; 3005 } 3006 3007 if (drv->bdrv_co_write_zeroes && 3008 buffer_is_zero(bounce_buffer, iov.iov_len)) { 3009 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 3010 cluster_nb_sectors, 0); 3011 } else { 3012 /* This does not change the data on the disk, it is not necessary 3013 * to flush even in cache=writethrough mode. 3014 */ 3015 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 3016 &bounce_qiov); 3017 } 3018 3019 if (ret < 0) { 3020 /* It might be okay to ignore write errors for guest requests. If this 3021 * is a deliberate copy-on-read then we don't want to ignore the error. 3022 * Simply report it in all cases. 3023 */ 3024 goto err; 3025 } 3026 3027 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 3028 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 3029 nb_sectors * BDRV_SECTOR_SIZE); 3030 3031 err: 3032 qemu_vfree(bounce_buffer); 3033 return ret; 3034 } 3035 3036 /* 3037 * Forwards an already correctly aligned request to the BlockDriver. This 3038 * handles copy on read and zeroing after EOF; any other features must be 3039 * implemented by the caller. 3040 */ 3041 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 3042 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 3043 int64_t align, QEMUIOVector *qiov, int flags) 3044 { 3045 BlockDriver *drv = bs->drv; 3046 int ret; 3047 3048 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 3049 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 3050 3051 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 3052 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 3053 3054 /* Handle Copy on Read and associated serialisation */ 3055 if (flags & BDRV_REQ_COPY_ON_READ) { 3056 /* If we touch the same cluster it counts as an overlap. This 3057 * guarantees that allocating writes will be serialized and not race 3058 * with each other for the same cluster. For example, in copy-on-read 3059 * it ensures that the CoR read and write operations are atomic and 3060 * guest writes cannot interleave between them. 
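 * (Concretely: without the serialisation, a guest write to the same
 * cluster could land between the backing-file read and the bounce-buffer
 * write-back in bdrv_co_do_copy_on_readv() above and be silently lost.)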
*/ 3061 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 3062 } 3063 3064 wait_serialising_requests(req); 3065 3066 if (flags & BDRV_REQ_COPY_ON_READ) { 3067 int pnum; 3068 3069 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 3070 if (ret < 0) { 3071 goto out; 3072 } 3073 3074 if (!ret || pnum != nb_sectors) { 3075 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 3076 goto out; 3077 } 3078 } 3079 3080 /* Forward the request to the BlockDriver */ 3081 if (!(bs->zero_beyond_eof && bs->growable)) { 3082 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 3083 } else { 3084 /* Read zeros after EOF of growable BDSes */ 3085 int64_t len, total_sectors, max_nb_sectors; 3086 3087 len = bdrv_getlength(bs); 3088 if (len < 0) { 3089 ret = len; 3090 goto out; 3091 } 3092 3093 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); 3094 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 3095 align >> BDRV_SECTOR_BITS); 3096 if (max_nb_sectors > 0) { 3097 ret = drv->bdrv_co_readv(bs, sector_num, 3098 MIN(nb_sectors, max_nb_sectors), qiov); 3099 } else { 3100 ret = 0; 3101 } 3102 3103 /* Reading beyond end of file is supposed to produce zeroes */ 3104 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 3105 uint64_t offset = MAX(0, total_sectors - sector_num); 3106 uint64_t bytes = (sector_num + nb_sectors - offset) * 3107 BDRV_SECTOR_SIZE; 3108 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 3109 } 3110 } 3111 3112 out: 3113 return ret; 3114 } 3115 3116 /* 3117 * Handle a read request in coroutine context 3118 */ 3119 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 3120 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 3121 BdrvRequestFlags flags) 3122 { 3123 BlockDriver *drv = bs->drv; 3124 BdrvTrackedRequest req; 3125 3126 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 3127 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 3128 uint8_t *head_buf = NULL; 3129 uint8_t *tail_buf = NULL; 3130 QEMUIOVector local_qiov; 3131 bool use_local_qiov = false; 3132 int ret; 3133 3134 if (!drv) { 3135 return -ENOMEDIUM; 3136 } 3137 if (bdrv_check_byte_request(bs, offset, bytes)) { 3138 return -EIO; 3139 } 3140 3141 if (bs->copy_on_read) { 3142 flags |= BDRV_REQ_COPY_ON_READ; 3143 } 3144 3145 /* throttling disk I/O */ 3146 if (bs->io_limits_enabled) { 3147 bdrv_io_limits_intercept(bs, bytes, false); 3148 } 3149 3150 /* Align read if necessary by padding qiov */ 3151 if (offset & (align - 1)) { 3152 head_buf = qemu_blockalign(bs, align); 3153 qemu_iovec_init(&local_qiov, qiov->niov + 2); 3154 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 3155 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3156 use_local_qiov = true; 3157 3158 bytes += offset & (align - 1); 3159 offset = offset & ~(align - 1); 3160 } 3161 3162 if ((offset + bytes) & (align - 1)) { 3163 if (!use_local_qiov) { 3164 qemu_iovec_init(&local_qiov, qiov->niov + 1); 3165 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3166 use_local_qiov = true; 3167 } 3168 tail_buf = qemu_blockalign(bs, align); 3169 qemu_iovec_add(&local_qiov, tail_buf, 3170 align - ((offset + bytes) & (align - 1))); 3171 3172 bytes = ROUND_UP(bytes, align); 3173 } 3174 3175 tracked_request_begin(&req, bs, offset, bytes, false); 3176 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 3177 use_local_qiov ? 
&local_qiov : qiov, 3178 flags); 3179 tracked_request_end(&req); 3180 3181 if (use_local_qiov) { 3182 qemu_iovec_destroy(&local_qiov); 3183 qemu_vfree(head_buf); 3184 qemu_vfree(tail_buf); 3185 } 3186 3187 return ret; 3188 } 3189 3190 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 3191 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 3192 BdrvRequestFlags flags) 3193 { 3194 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { 3195 return -EINVAL; 3196 } 3197 3198 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 3199 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 3200 } 3201 3202 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 3203 int nb_sectors, QEMUIOVector *qiov) 3204 { 3205 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 3206 3207 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 3208 } 3209 3210 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 3211 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 3212 { 3213 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 3214 3215 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 3216 BDRV_REQ_COPY_ON_READ); 3217 } 3218 3219 /* if no limit is specified in the BlockLimits use a default 3220 * of 32768 512-byte sectors (16 MiB) per request. 3221 */ 3222 #define MAX_WRITE_ZEROES_DEFAULT 32768 3223 3224 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 3225 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 3226 { 3227 BlockDriver *drv = bs->drv; 3228 QEMUIOVector qiov; 3229 struct iovec iov = {0}; 3230 int ret = 0; 3231 3232 int max_write_zeroes = bs->bl.max_write_zeroes ? 3233 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; 3234 3235 while (nb_sectors > 0 && !ret) { 3236 int num = nb_sectors; 3237 3238 /* Align request. Block drivers can expect the "bulk" of the request 3239 * to be aligned. 3240 */ 3241 if (bs->bl.write_zeroes_alignment 3242 && num > bs->bl.write_zeroes_alignment) { 3243 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 3244 /* Make a small request up to the first aligned sector. */ 3245 num = bs->bl.write_zeroes_alignment; 3246 num -= sector_num % bs->bl.write_zeroes_alignment; 3247 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 3248 /* Shorten the request to the last aligned sector. num cannot 3249 * underflow because num > bs->bl.write_zeroes_alignment. 3250 */ 3251 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 3252 } 3253 } 3254 3255 /* limit request size */ 3256 if (num > max_write_zeroes) { 3257 num = max_write_zeroes; 3258 } 3259 3260 ret = -ENOTSUP; 3261 /* First try the efficient write zeroes operation */ 3262 if (drv->bdrv_co_write_zeroes) { 3263 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 3264 } 3265 3266 if (ret == -ENOTSUP) { 3267 /* Fall back to bounce buffer if write zeroes is unsupported */ 3268 iov.iov_len = num * BDRV_SECTOR_SIZE; 3269 if (iov.iov_base == NULL) { 3270 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE); 3271 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 3272 } 3273 qemu_iovec_init_external(&qiov, &iov, 1); 3274 3275 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 3276 3277 /* Keep bounce buffer around if it is big enough for all 3278 * future requests.
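 * (The buffer was sized for this iteration's num sectors; only a buffer of
 * max_write_zeroes sectors can serve every later iteration, so smaller
 * ones are freed below and reallocated at the next size when needed.)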
3279 */ 3280 if (num < max_write_zeroes) { 3281 qemu_vfree(iov.iov_base); 3282 iov.iov_base = NULL; 3283 } 3284 } 3285 3286 sector_num += num; 3287 nb_sectors -= num; 3288 } 3289 3290 qemu_vfree(iov.iov_base); 3291 return ret; 3292 } 3293 3294 /* 3295 * Forwards an already correctly aligned write request to the BlockDriver. 3296 */ 3297 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 3298 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 3299 QEMUIOVector *qiov, int flags) 3300 { 3301 BlockDriver *drv = bs->drv; 3302 bool waited; 3303 int ret; 3304 3305 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 3306 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 3307 3308 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 3309 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 3310 3311 waited = wait_serialising_requests(req); 3312 assert(!waited || !req->serialising); 3313 assert(req->overlap_offset <= offset); 3314 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 3315 3316 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 3317 3318 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 3319 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 3320 qemu_iovec_is_zero(qiov)) { 3321 flags |= BDRV_REQ_ZERO_WRITE; 3322 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 3323 flags |= BDRV_REQ_MAY_UNMAP; 3324 } 3325 } 3326 3327 if (ret < 0) { 3328 /* Do nothing, write notifier decided to fail this request */ 3329 } else if (flags & BDRV_REQ_ZERO_WRITE) { 3330 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); 3331 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 3332 } else { 3333 BLKDBG_EVENT(bs, BLKDBG_PWRITEV); 3334 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 3335 } 3336 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); 3337 3338 if (ret == 0 && !bs->enable_write_cache) { 3339 ret = bdrv_co_flush(bs); 3340 } 3341 3342 bdrv_set_dirty(bs, sector_num, nb_sectors); 3343 3344 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { 3345 bs->wr_highest_sector = sector_num + nb_sectors - 1; 3346 } 3347 if (bs->growable && ret >= 0) { 3348 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 3349 } 3350 3351 return ret; 3352 } 3353 3354 /* 3355 * Handle a write request in coroutine context 3356 */ 3357 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 3358 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 3359 BdrvRequestFlags flags) 3360 { 3361 BdrvTrackedRequest req; 3362 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 3363 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 3364 uint8_t *head_buf = NULL; 3365 uint8_t *tail_buf = NULL; 3366 QEMUIOVector local_qiov; 3367 bool use_local_qiov = false; 3368 int ret; 3369 3370 if (!bs->drv) { 3371 return -ENOMEDIUM; 3372 } 3373 if (bs->read_only) { 3374 return -EACCES; 3375 } 3376 if (bdrv_check_byte_request(bs, offset, bytes)) { 3377 return -EIO; 3378 } 3379 3380 /* throttling disk I/O */ 3381 if (bs->io_limits_enabled) { 3382 bdrv_io_limits_intercept(bs, bytes, true); 3383 } 3384 3385 /* 3386 * Align write if necessary by performing a read-modify-write cycle. 3387 * Pad qiov with the read parts and be sure to have a tracked request not 3388 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
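 *
 * Worked example (illustrative, with align == 512): a 100-byte write at
 * offset 1000 first reads the head sector [512, 1024) and the tail sector
 * [1024, 1536), splices the 100 guest bytes over the range [1000, 1100),
 * and then issues a single aligned 1024-byte write covering [512, 1536).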
3389 */ 3390 tracked_request_begin(&req, bs, offset, bytes, true); 3391 3392 if (offset & (align - 1)) { 3393 QEMUIOVector head_qiov; 3394 struct iovec head_iov; 3395 3396 mark_request_serialising(&req, align); 3397 wait_serialising_requests(&req); 3398 3399 head_buf = qemu_blockalign(bs, align); 3400 head_iov = (struct iovec) { 3401 .iov_base = head_buf, 3402 .iov_len = align, 3403 }; 3404 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 3405 3406 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 3407 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 3408 align, &head_qiov, 0); 3409 if (ret < 0) { 3410 goto fail; 3411 } 3412 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 3413 3414 qemu_iovec_init(&local_qiov, qiov->niov + 2); 3415 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 3416 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3417 use_local_qiov = true; 3418 3419 bytes += offset & (align - 1); 3420 offset = offset & ~(align - 1); 3421 } 3422 3423 if ((offset + bytes) & (align - 1)) { 3424 QEMUIOVector tail_qiov; 3425 struct iovec tail_iov; 3426 size_t tail_bytes; 3427 bool waited; 3428 3429 mark_request_serialising(&req, align); 3430 waited = wait_serialising_requests(&req); 3431 assert(!waited || !use_local_qiov); 3432 3433 tail_buf = qemu_blockalign(bs, align); 3434 tail_iov = (struct iovec) { 3435 .iov_base = tail_buf, 3436 .iov_len = align, 3437 }; 3438 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 3439 3440 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 3441 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 3442 align, &tail_qiov, 0); 3443 if (ret < 0) { 3444 goto fail; 3445 } 3446 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 3447 3448 if (!use_local_qiov) { 3449 qemu_iovec_init(&local_qiov, qiov->niov + 1); 3450 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 3451 use_local_qiov = true; 3452 } 3453 3454 tail_bytes = (offset + bytes) & (align - 1); 3455 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 3456 3457 bytes = ROUND_UP(bytes, align); 3458 } 3459 3460 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 3461 use_local_qiov ? 
&local_qiov : qiov, 3462 flags); 3463 3464 fail: 3465 tracked_request_end(&req); 3466 3467 if (use_local_qiov) { 3468 qemu_iovec_destroy(&local_qiov); 3469 } 3470 qemu_vfree(head_buf); 3471 qemu_vfree(tail_buf); 3472 3473 return ret; 3474 } 3475 3476 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 3477 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 3478 BdrvRequestFlags flags) 3479 { 3480 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { 3481 return -EINVAL; 3482 } 3483 3484 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 3485 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 3486 } 3487 3488 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 3489 int nb_sectors, QEMUIOVector *qiov) 3490 { 3491 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 3492 3493 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 3494 } 3495 3496 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 3497 int64_t sector_num, int nb_sectors, 3498 BdrvRequestFlags flags) 3499 { 3500 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 3501 3502 if (!(bs->open_flags & BDRV_O_UNMAP)) { 3503 flags &= ~BDRV_REQ_MAY_UNMAP; 3504 } 3505 3506 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 3507 BDRV_REQ_ZERO_WRITE | flags); 3508 } 3509 3510 /** 3511 * Truncate file to 'offset' bytes (needed only for file protocols) 3512 */ 3513 int bdrv_truncate(BlockDriverState *bs, int64_t offset) 3514 { 3515 BlockDriver *drv = bs->drv; 3516 int ret; 3517 if (!drv) 3518 return -ENOMEDIUM; 3519 if (!drv->bdrv_truncate) 3520 return -ENOTSUP; 3521 if (bs->read_only) 3522 return -EACCES; 3523 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) { 3524 return -EBUSY; 3525 } 3526 ret = drv->bdrv_truncate(bs, offset); 3527 if (ret == 0) { 3528 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3529 bdrv_dev_resize_cb(bs); 3530 } 3531 return ret; 3532 } 3533 3534 /** 3535 * Length of an allocated file in bytes. Sparse files are counted by actual 3536 * allocated space. Return < 0 if error or unknown. 3537 */ 3538 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) 3539 { 3540 BlockDriver *drv = bs->drv; 3541 if (!drv) { 3542 return -ENOMEDIUM; 3543 } 3544 if (drv->bdrv_get_allocated_file_size) { 3545 return drv->bdrv_get_allocated_file_size(bs); 3546 } 3547 if (bs->file) { 3548 return bdrv_get_allocated_file_size(bs->file); 3549 } 3550 return -ENOTSUP; 3551 } 3552 3553 /** 3554 * Length of a file in bytes. Return < 0 if error or unknown.
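 * (Contrast with bdrv_get_allocated_file_size() above: for a sparse image
 * this returns the nominal size of the virtual disk, not the space the
 * file actually consumes on the host.)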
3555 */ 3556 int64_t bdrv_getlength(BlockDriverState *bs) 3557 { 3558 BlockDriver *drv = bs->drv; 3559 if (!drv) 3560 return -ENOMEDIUM; 3561 3562 if (drv->has_variable_length) { 3563 int ret = refresh_total_sectors(bs, bs->total_sectors); 3564 if (ret < 0) { 3565 return ret; 3566 } 3567 } 3568 return bs->total_sectors * BDRV_SECTOR_SIZE; 3569 } 3570 3571 /* return 0 as number of sectors if no device present or error */ 3572 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) 3573 { 3574 int64_t length; 3575 length = bdrv_getlength(bs); 3576 if (length < 0) 3577 length = 0; 3578 else 3579 length = length >> BDRV_SECTOR_BITS; 3580 *nb_sectors_ptr = length; 3581 } 3582 3583 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, 3584 BlockdevOnError on_write_error) 3585 { 3586 bs->on_read_error = on_read_error; 3587 bs->on_write_error = on_write_error; 3588 } 3589 3590 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) 3591 { 3592 return is_read ? bs->on_read_error : bs->on_write_error; 3593 } 3594 3595 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) 3596 { 3597 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; 3598 3599 switch (on_err) { 3600 case BLOCKDEV_ON_ERROR_ENOSPC: 3601 return (error == ENOSPC) ? 3602 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; 3603 case BLOCKDEV_ON_ERROR_STOP: 3604 return BLOCK_ERROR_ACTION_STOP; 3605 case BLOCKDEV_ON_ERROR_REPORT: 3606 return BLOCK_ERROR_ACTION_REPORT; 3607 case BLOCKDEV_ON_ERROR_IGNORE: 3608 return BLOCK_ERROR_ACTION_IGNORE; 3609 default: 3610 abort(); 3611 } 3612 } 3613 3614 /* This is done by device models because, while the block layer knows 3615 * about the error, it does not know whether an operation comes from 3616 * the device or the block layer (from a job, for example). 3617 */ 3618 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, 3619 bool is_read, int error) 3620 { 3621 assert(error >= 0); 3622 3623 if (action == BLOCK_ERROR_ACTION_STOP) { 3624 /* First set the iostatus, so that "info block" returns an iostatus 3625 * that matches the events raised so far (an additional error iostatus 3626 * is fine, but not a lost one). 3627 */ 3628 bdrv_iostatus_set_err(bs, error); 3629 3630 /* Then raise the request to stop the VM and the event. 3631 * qemu_system_vmstop_request_prepare has two effects. First, 3632 * it ensures that the STOP event always comes after the 3633 * BLOCK_IO_ERROR event. Second, it ensures that even if management 3634 * can observe the STOP event and do a "cont" before the STOP 3635 * event is issued, the VM will not stop. In this case, vm_start() 3636 * also ensures that the STOP/RESUME pair of events is emitted. 3637 */ 3638 qemu_system_vmstop_request_prepare(); 3639 qapi_event_send_block_io_error(bdrv_get_device_name(bs), 3640 is_read ? IO_OPERATION_TYPE_READ : 3641 IO_OPERATION_TYPE_WRITE, 3642 action, &error_abort); 3643 qemu_system_vmstop_request(RUN_STATE_IO_ERROR); 3644 } else { 3645 qapi_event_send_block_io_error(bdrv_get_device_name(bs), 3646 is_read ? 
IO_OPERATION_TYPE_READ : 3647 IO_OPERATION_TYPE_WRITE, 3648 action, &error_abort); 3649 } 3650 } 3651 3652 int bdrv_is_read_only(BlockDriverState *bs) 3653 { 3654 return bs->read_only; 3655 } 3656 3657 int bdrv_is_sg(BlockDriverState *bs) 3658 { 3659 return bs->sg; 3660 } 3661 3662 int bdrv_enable_write_cache(BlockDriverState *bs) 3663 { 3664 return bs->enable_write_cache; 3665 } 3666 3667 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) 3668 { 3669 bs->enable_write_cache = wce; 3670 3671 /* so a reopen() will preserve wce */ 3672 if (wce) { 3673 bs->open_flags |= BDRV_O_CACHE_WB; 3674 } else { 3675 bs->open_flags &= ~BDRV_O_CACHE_WB; 3676 } 3677 } 3678 3679 int bdrv_is_encrypted(BlockDriverState *bs) 3680 { 3681 if (bs->backing_hd && bs->backing_hd->encrypted) 3682 return 1; 3683 return bs->encrypted; 3684 } 3685 3686 int bdrv_key_required(BlockDriverState *bs) 3687 { 3688 BlockDriverState *backing_hd = bs->backing_hd; 3689 3690 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) 3691 return 1; 3692 return (bs->encrypted && !bs->valid_key); 3693 } 3694 3695 int bdrv_set_key(BlockDriverState *bs, const char *key) 3696 { 3697 int ret; 3698 if (bs->backing_hd && bs->backing_hd->encrypted) { 3699 ret = bdrv_set_key(bs->backing_hd, key); 3700 if (ret < 0) 3701 return ret; 3702 if (!bs->encrypted) 3703 return 0; 3704 } 3705 if (!bs->encrypted) { 3706 return -EINVAL; 3707 } else if (!bs->drv || !bs->drv->bdrv_set_key) { 3708 return -ENOMEDIUM; 3709 } 3710 ret = bs->drv->bdrv_set_key(bs, key); 3711 if (ret < 0) { 3712 bs->valid_key = 0; 3713 } else if (!bs->valid_key) { 3714 bs->valid_key = 1; 3715 /* call the change callback now, we skipped it on open */ 3716 bdrv_dev_change_media_cb(bs, true); 3717 } 3718 return ret; 3719 } 3720 3721 const char *bdrv_get_format_name(BlockDriverState *bs) 3722 { 3723 return bs->drv ? bs->drv->format_name : NULL; 3724 } 3725 3726 void bdrv_iterate_format(void (*it)(void *opaque, const char *name), 3727 void *opaque) 3728 { 3729 BlockDriver *drv; 3730 int count = 0; 3731 const char **formats = NULL; 3732 3733 QLIST_FOREACH(drv, &bdrv_drivers, list) { 3734 if (drv->format_name) { 3735 bool found = false; 3736 int i = count; 3737 while (formats && i && !found) { 3738 found = !strcmp(formats[--i], drv->format_name); 3739 } 3740 3741 if (!found) { 3742 formats = g_realloc(formats, (count + 1) * sizeof(char *)); 3743 formats[count++] = drv->format_name; 3744 it(opaque, drv->format_name); 3745 } 3746 } 3747 } 3748 g_free(formats); 3749 } 3750 3751 /* This function is to find block backend bs */ 3752 BlockDriverState *bdrv_find(const char *name) 3753 { 3754 BlockDriverState *bs; 3755 3756 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 3757 if (!strcmp(name, bs->device_name)) { 3758 return bs; 3759 } 3760 } 3761 return NULL; 3762 } 3763 3764 /* This function is to find a node in the bs graph */ 3765 BlockDriverState *bdrv_find_node(const char *node_name) 3766 { 3767 BlockDriverState *bs; 3768 3769 assert(node_name); 3770 3771 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { 3772 if (!strcmp(node_name, bs->node_name)) { 3773 return bs; 3774 } 3775 } 3776 return NULL; 3777 } 3778 3779 /* Put this QMP function here so it can access the static graph_bdrv_states. 
*/ 3780 BlockDeviceInfoList *bdrv_named_nodes_list(void) 3781 { 3782 BlockDeviceInfoList *list, *entry; 3783 BlockDriverState *bs; 3784 3785 list = NULL; 3786 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { 3787 entry = g_malloc0(sizeof(*entry)); 3788 entry->value = bdrv_block_device_info(bs); 3789 entry->next = list; 3790 list = entry; 3791 } 3792 3793 return list; 3794 } 3795 3796 BlockDriverState *bdrv_lookup_bs(const char *device, 3797 const char *node_name, 3798 Error **errp) 3799 { 3800 BlockDriverState *bs = NULL; 3801 3802 if (device) { 3803 bs = bdrv_find(device); 3804 3805 if (bs) { 3806 return bs; 3807 } 3808 } 3809 3810 if (node_name) { 3811 bs = bdrv_find_node(node_name); 3812 3813 if (bs) { 3814 return bs; 3815 } 3816 } 3817 3818 error_setg(errp, "Cannot find device=%s nor node_name=%s", 3819 device ? device : "", 3820 node_name ? node_name : ""); 3821 return NULL; 3822 } 3823 3824 BlockDriverState *bdrv_next(BlockDriverState *bs) 3825 { 3826 if (!bs) { 3827 return QTAILQ_FIRST(&bdrv_states); 3828 } 3829 return QTAILQ_NEXT(bs, device_list); 3830 } 3831 3832 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) 3833 { 3834 BlockDriverState *bs; 3835 3836 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 3837 it(opaque, bs); 3838 } 3839 } 3840 3841 const char *bdrv_get_device_name(BlockDriverState *bs) 3842 { 3843 return bs->device_name; 3844 } 3845 3846 int bdrv_get_flags(BlockDriverState *bs) 3847 { 3848 return bs->open_flags; 3849 } 3850 3851 int bdrv_flush_all(void) 3852 { 3853 BlockDriverState *bs; 3854 int result = 0; 3855 3856 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 3857 AioContext *aio_context = bdrv_get_aio_context(bs); 3858 int ret; 3859 3860 aio_context_acquire(aio_context); 3861 ret = bdrv_flush(bs); 3862 if (ret < 0 && !result) { 3863 result = ret; 3864 } 3865 aio_context_release(aio_context); 3866 } 3867 3868 return result; 3869 } 3870 3871 int bdrv_has_zero_init_1(BlockDriverState *bs) 3872 { 3873 return 1; 3874 } 3875 3876 int bdrv_has_zero_init(BlockDriverState *bs) 3877 { 3878 assert(bs->drv); 3879 3880 /* If BS is a copy on write image, it is initialized to 3881 the contents of the base image, which may not be zeroes. */ 3882 if (bs->backing_hd) { 3883 return 0; 3884 } 3885 if (bs->drv->bdrv_has_zero_init) { 3886 return bs->drv->bdrv_has_zero_init(bs); 3887 } 3888 3889 /* safe default */ 3890 return 0; 3891 } 3892 3893 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) 3894 { 3895 BlockDriverInfo bdi; 3896 3897 if (bs->backing_hd) { 3898 return false; 3899 } 3900 3901 if (bdrv_get_info(bs, &bdi) == 0) { 3902 return bdi.unallocated_blocks_are_zero; 3903 } 3904 3905 return false; 3906 } 3907 3908 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) 3909 { 3910 BlockDriverInfo bdi; 3911 3912 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { 3913 return false; 3914 } 3915 3916 if (bdrv_get_info(bs, &bdi) == 0) { 3917 return bdi.can_write_zeroes_with_unmap; 3918 } 3919 3920 return false; 3921 } 3922 3923 typedef struct BdrvCoGetBlockStatusData { 3924 BlockDriverState *bs; 3925 BlockDriverState *base; 3926 int64_t sector_num; 3927 int nb_sectors; 3928 int *pnum; 3929 int64_t ret; 3930 bool done; 3931 } BdrvCoGetBlockStatusData; 3932 3933 /* 3934 * Returns true iff the specified sector is present in the disk image. Drivers 3935 * not implementing the functionality are assumed to not support backing files, 3936 * hence all their sectors are reported as allocated. 
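 *
 * The return value is a bit mask: BDRV_BLOCK_DATA and BDRV_BLOCK_ZERO
 * describe the contents, BDRV_BLOCK_ALLOCATED says whether this layer
 * (rather than a backing file) accounts for the data, and when
 * BDRV_BLOCK_OFFSET_VALID is set the remaining high bits give the offset
 * of the data in bs->file, as can be seen in the flag handling below.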
3937 * 3938 * If 'sector_num' is beyond the end of the disk image the return value is 0 3939 * and 'pnum' is set to 0. 3940 * 3941 * 'pnum' is set to the number of sectors (including and immediately following 3942 * the specified sector) that are known to be in the same 3943 * allocated/unallocated state. 3944 * 3945 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 3946 * beyond the end of the disk image it will be clamped. 3947 */ 3948 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 3949 int64_t sector_num, 3950 int nb_sectors, int *pnum) 3951 { 3952 int64_t length; 3953 int64_t n; 3954 int64_t ret, ret2; 3955 3956 length = bdrv_getlength(bs); 3957 if (length < 0) { 3958 return length; 3959 } 3960 3961 if (sector_num >= (length >> BDRV_SECTOR_BITS)) { 3962 *pnum = 0; 3963 return 0; 3964 } 3965 3966 n = bs->total_sectors - sector_num; 3967 if (n < nb_sectors) { 3968 nb_sectors = n; 3969 } 3970 3971 if (!bs->drv->bdrv_co_get_block_status) { 3972 *pnum = nb_sectors; 3973 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 3974 if (bs->drv->protocol_name) { 3975 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 3976 } 3977 return ret; 3978 } 3979 3980 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 3981 if (ret < 0) { 3982 *pnum = 0; 3983 return ret; 3984 } 3985 3986 if (ret & BDRV_BLOCK_RAW) { 3987 assert(ret & BDRV_BLOCK_OFFSET_VALID); 3988 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 3989 *pnum, pnum); 3990 } 3991 3992 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 3993 ret |= BDRV_BLOCK_ALLOCATED; 3994 } 3995 3996 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { 3997 if (bdrv_unallocated_blocks_are_zero(bs)) { 3998 ret |= BDRV_BLOCK_ZERO; 3999 } else if (bs->backing_hd) { 4000 BlockDriverState *bs2 = bs->backing_hd; 4001 int64_t length2 = bdrv_getlength(bs2); 4002 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) { 4003 ret |= BDRV_BLOCK_ZERO; 4004 } 4005 } 4006 } 4007 4008 if (bs->file && 4009 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 4010 (ret & BDRV_BLOCK_OFFSET_VALID)) { 4011 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 4012 *pnum, pnum); 4013 if (ret2 >= 0) { 4014 /* Ignore errors. This is just providing extra information, it 4015 * is useful but not necessary. 4016 */ 4017 ret |= (ret2 & BDRV_BLOCK_ZERO); 4018 } 4019 } 4020 4021 return ret; 4022 } 4023 4024 /* Coroutine wrapper for bdrv_get_block_status() */ 4025 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) 4026 { 4027 BdrvCoGetBlockStatusData *data = opaque; 4028 BlockDriverState *bs = data->bs; 4029 4030 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, 4031 data->pnum); 4032 data->done = true; 4033 } 4034 4035 /* 4036 * Synchronous wrapper around bdrv_co_get_block_status(). 4037 * 4038 * See bdrv_co_get_block_status() for details. 
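 *
 * Hypothetical usage sketch, mirroring the loop in bdrv_make_zero() above:
 *
 *     int pnum;
 *     int64_t st = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (st >= 0 && (st & BDRV_BLOCK_ZERO)) {
 *         sector_num += pnum;   // range already reads as zero, skip it
 *     }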
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return (ret & BDRV_BLOCK_ALLOCATED);
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive). BASE can be NULL to check if the given
 * sector is allocated in any image of the chain. Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have [sector_num+x, nb_sectors] allocated.
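         * If that happens we can only trust the first pnum_inter sectors
         * of the answer, so n is clamped to pnum_inter below (unless a
         * short non-top intermediate image ends inside the run).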
4114 */ 4115 if (n > pnum_inter && 4116 (intermediate == top || 4117 sector_num + pnum_inter < intermediate->total_sectors)) { 4118 n = pnum_inter; 4119 } 4120 4121 intermediate = intermediate->backing_hd; 4122 } 4123 4124 *pnum = n; 4125 return 0; 4126 } 4127 4128 const char *bdrv_get_encrypted_filename(BlockDriverState *bs) 4129 { 4130 if (bs->backing_hd && bs->backing_hd->encrypted) 4131 return bs->backing_file; 4132 else if (bs->encrypted) 4133 return bs->filename; 4134 else 4135 return NULL; 4136 } 4137 4138 void bdrv_get_backing_filename(BlockDriverState *bs, 4139 char *filename, int filename_size) 4140 { 4141 pstrcpy(filename, filename_size, bs->backing_file); 4142 } 4143 4144 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 4145 const uint8_t *buf, int nb_sectors) 4146 { 4147 BlockDriver *drv = bs->drv; 4148 if (!drv) 4149 return -ENOMEDIUM; 4150 if (!drv->bdrv_write_compressed) 4151 return -ENOTSUP; 4152 if (bdrv_check_request(bs, sector_num, nb_sectors)) 4153 return -EIO; 4154 4155 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 4156 4157 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 4158 } 4159 4160 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 4161 { 4162 BlockDriver *drv = bs->drv; 4163 if (!drv) 4164 return -ENOMEDIUM; 4165 if (!drv->bdrv_get_info) 4166 return -ENOTSUP; 4167 memset(bdi, 0, sizeof(*bdi)); 4168 return drv->bdrv_get_info(bs, bdi); 4169 } 4170 4171 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) 4172 { 4173 BlockDriver *drv = bs->drv; 4174 if (drv && drv->bdrv_get_specific_info) { 4175 return drv->bdrv_get_specific_info(bs); 4176 } 4177 return NULL; 4178 } 4179 4180 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 4181 int64_t pos, int size) 4182 { 4183 QEMUIOVector qiov; 4184 struct iovec iov = { 4185 .iov_base = (void *) buf, 4186 .iov_len = size, 4187 }; 4188 4189 qemu_iovec_init_external(&qiov, &iov, 1); 4190 return bdrv_writev_vmstate(bs, &qiov, pos); 4191 } 4192 4193 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 4194 { 4195 BlockDriver *drv = bs->drv; 4196 4197 if (!drv) { 4198 return -ENOMEDIUM; 4199 } else if (drv->bdrv_save_vmstate) { 4200 return drv->bdrv_save_vmstate(bs, qiov, pos); 4201 } else if (bs->file) { 4202 return bdrv_writev_vmstate(bs->file, qiov, pos); 4203 } 4204 4205 return -ENOTSUP; 4206 } 4207 4208 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 4209 int64_t pos, int size) 4210 { 4211 BlockDriver *drv = bs->drv; 4212 if (!drv) 4213 return -ENOMEDIUM; 4214 if (drv->bdrv_load_vmstate) 4215 return drv->bdrv_load_vmstate(bs, buf, pos, size); 4216 if (bs->file) 4217 return bdrv_load_vmstate(bs->file, buf, pos, size); 4218 return -ENOTSUP; 4219 } 4220 4221 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) 4222 { 4223 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { 4224 return; 4225 } 4226 4227 bs->drv->bdrv_debug_event(bs, event); 4228 } 4229 4230 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, 4231 const char *tag) 4232 { 4233 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { 4234 bs = bs->file; 4235 } 4236 4237 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { 4238 return bs->drv->bdrv_debug_breakpoint(bs, event, tag); 4239 } 4240 4241 return -ENOTSUP; 4242 } 4243 4244 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) 4245 { 4246 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { 4247 bs = bs->file; 4248 } 4249 4250 if (bs && 
bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { 4251 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); 4252 } 4253 4254 return -ENOTSUP; 4255 } 4256 4257 int bdrv_debug_resume(BlockDriverState *bs, const char *tag) 4258 { 4259 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { 4260 bs = bs->file; 4261 } 4262 4263 if (bs && bs->drv && bs->drv->bdrv_debug_resume) { 4264 return bs->drv->bdrv_debug_resume(bs, tag); 4265 } 4266 4267 return -ENOTSUP; 4268 } 4269 4270 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) 4271 { 4272 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { 4273 bs = bs->file; 4274 } 4275 4276 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { 4277 return bs->drv->bdrv_debug_is_suspended(bs, tag); 4278 } 4279 4280 return false; 4281 } 4282 4283 int bdrv_is_snapshot(BlockDriverState *bs) 4284 { 4285 return !!(bs->open_flags & BDRV_O_SNAPSHOT); 4286 } 4287 4288 /* backing_file can either be relative, or absolute, or a protocol. If it is 4289 * relative, it must be relative to the chain. So, passing in bs->filename 4290 * from a BDS as backing_file should not be done, as that may be relative to 4291 * the CWD rather than the chain. */ 4292 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, 4293 const char *backing_file) 4294 { 4295 char *filename_full = NULL; 4296 char *backing_file_full = NULL; 4297 char *filename_tmp = NULL; 4298 int is_protocol = 0; 4299 BlockDriverState *curr_bs = NULL; 4300 BlockDriverState *retval = NULL; 4301 4302 if (!bs || !bs->drv || !backing_file) { 4303 return NULL; 4304 } 4305 4306 filename_full = g_malloc(PATH_MAX); 4307 backing_file_full = g_malloc(PATH_MAX); 4308 filename_tmp = g_malloc(PATH_MAX); 4309 4310 is_protocol = path_has_protocol(backing_file); 4311 4312 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { 4313 4314 /* If either of the filename paths is actually a protocol, then 4315 * compare unmodified paths; otherwise make paths relative */ 4316 if (is_protocol || path_has_protocol(curr_bs->backing_file)) { 4317 if (strcmp(backing_file, curr_bs->backing_file) == 0) { 4318 retval = curr_bs->backing_hd; 4319 break; 4320 } 4321 } else { 4322 /* If not an absolute filename path, make it relative to the current 4323 * image's filename path */ 4324 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 4325 backing_file); 4326 4327 /* We are going to compare absolute pathnames */ 4328 if (!realpath(filename_tmp, filename_full)) { 4329 continue; 4330 } 4331 4332 /* We need to make sure the backing filename we are comparing against 4333 * is relative to the current image filename (or absolute) */ 4334 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 4335 curr_bs->backing_file); 4336 4337 if (!realpath(filename_tmp, backing_file_full)) { 4338 continue; 4339 } 4340 4341 if (strcmp(backing_file_full, filename_full) == 0) { 4342 retval = curr_bs->backing_hd; 4343 break; 4344 } 4345 } 4346 } 4347 4348 g_free(filename_full); 4349 g_free(backing_file_full); 4350 g_free(filename_tmp); 4351 return retval; 4352 } 4353 4354 int bdrv_get_backing_file_depth(BlockDriverState *bs) 4355 { 4356 if (!bs->drv) { 4357 return 0; 4358 } 4359 4360 if (!bs->backing_hd) { 4361 return 0; 4362 } 4363 4364 return 1 + bdrv_get_backing_file_depth(bs->backing_hd); 4365 } 4366 4367 BlockDriverState *bdrv_find_base(BlockDriverState *bs) 4368 { 4369 BlockDriverState *curr_bs = NULL; 4370 4371 if (!bs) { 4372 return NULL; 4373 } 4374 4375 curr_bs = bs; 4376 4377 while 
(curr_bs->backing_hd) { 4378 curr_bs = curr_bs->backing_hd; 4379 } 4380 return curr_bs; 4381 } 4382 4383 /**************************************************************/ 4384 /* async I/Os */ 4385 4386 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 4387 QEMUIOVector *qiov, int nb_sectors, 4388 BlockDriverCompletionFunc *cb, void *opaque) 4389 { 4390 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 4391 4392 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 4393 cb, opaque, false); 4394 } 4395 4396 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 4397 QEMUIOVector *qiov, int nb_sectors, 4398 BlockDriverCompletionFunc *cb, void *opaque) 4399 { 4400 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 4401 4402 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 4403 cb, opaque, true); 4404 } 4405 4406 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 4407 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 4408 BlockDriverCompletionFunc *cb, void *opaque) 4409 { 4410 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 4411 4412 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 4413 BDRV_REQ_ZERO_WRITE | flags, 4414 cb, opaque, true); 4415 } 4416 4417 4418 typedef struct MultiwriteCB { 4419 int error; 4420 int num_requests; 4421 int num_callbacks; 4422 struct { 4423 BlockDriverCompletionFunc *cb; 4424 void *opaque; 4425 QEMUIOVector *free_qiov; 4426 } callbacks[]; 4427 } MultiwriteCB; 4428 4429 static void multiwrite_user_cb(MultiwriteCB *mcb) 4430 { 4431 int i; 4432 4433 for (i = 0; i < mcb->num_callbacks; i++) { 4434 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 4435 if (mcb->callbacks[i].free_qiov) { 4436 qemu_iovec_destroy(mcb->callbacks[i].free_qiov); 4437 } 4438 g_free(mcb->callbacks[i].free_qiov); 4439 } 4440 } 4441 4442 static void multiwrite_cb(void *opaque, int ret) 4443 { 4444 MultiwriteCB *mcb = opaque; 4445 4446 trace_multiwrite_cb(mcb, ret); 4447 4448 if (ret < 0 && !mcb->error) { 4449 mcb->error = ret; 4450 } 4451 4452 mcb->num_requests--; 4453 if (mcb->num_requests == 0) { 4454 multiwrite_user_cb(mcb); 4455 g_free(mcb); 4456 } 4457 } 4458 4459 static int multiwrite_req_compare(const void *a, const void *b) 4460 { 4461 const BlockRequest *req1 = a, *req2 = b; 4462 4463 /* 4464 * Note that we can't simply subtract req2->sector from req1->sector 4465 * here as that could overflow the return value. 4466 */ 4467 if (req1->sector > req2->sector) { 4468 return 1; 4469 } else if (req1->sector < req2->sector) { 4470 return -1; 4471 } else { 4472 return 0; 4473 } 4474 } 4475 4476 /* 4477 * Takes a bunch of requests and tries to merge them. Returns the number of 4478 * requests that remain after merging. 4479 */ 4480 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, 4481 int num_reqs, MultiwriteCB *mcb) 4482 { 4483 int i, outidx; 4484 4485 // Sort requests by start sector 4486 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); 4487 4488 // Check if adjacent requests touch the same clusters. If so, combine them, 4489 // filling up gaps with zero sectors. 4490 outidx = 0; 4491 for (i = 1; i < num_reqs; i++) { 4492 int merge = 0; 4493 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; 4494 4495 // Handle exactly sequential writes and overlapping writes. 
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1 and any of the requests
 * may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests.
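     * Each request reports its completion through multiwrite_cb(), which
     * decrements num_requests and fires the user callbacks once all of
     * them have finished.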
*/ 4583 mcb->num_requests = num_reqs; 4584 for (i = 0; i < num_reqs; i++) { 4585 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 4586 reqs[i].nb_sectors, reqs[i].flags, 4587 multiwrite_cb, mcb, 4588 true); 4589 } 4590 4591 return 0; 4592 } 4593 4594 void bdrv_aio_cancel(BlockDriverAIOCB *acb) 4595 { 4596 acb->aiocb_info->cancel(acb); 4597 } 4598 4599 /**************************************************************/ 4600 /* async block device emulation */ 4601 4602 typedef struct BlockDriverAIOCBSync { 4603 BlockDriverAIOCB common; 4604 QEMUBH *bh; 4605 int ret; 4606 /* vector translation state */ 4607 QEMUIOVector *qiov; 4608 uint8_t *bounce; 4609 int is_write; 4610 } BlockDriverAIOCBSync; 4611 4612 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) 4613 { 4614 BlockDriverAIOCBSync *acb = 4615 container_of(blockacb, BlockDriverAIOCBSync, common); 4616 qemu_bh_delete(acb->bh); 4617 acb->bh = NULL; 4618 qemu_aio_release(acb); 4619 } 4620 4621 static const AIOCBInfo bdrv_em_aiocb_info = { 4622 .aiocb_size = sizeof(BlockDriverAIOCBSync), 4623 .cancel = bdrv_aio_cancel_em, 4624 }; 4625 4626 static void bdrv_aio_bh_cb(void *opaque) 4627 { 4628 BlockDriverAIOCBSync *acb = opaque; 4629 4630 if (!acb->is_write) 4631 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 4632 qemu_vfree(acb->bounce); 4633 acb->common.cb(acb->common.opaque, acb->ret); 4634 qemu_bh_delete(acb->bh); 4635 acb->bh = NULL; 4636 qemu_aio_release(acb); 4637 } 4638 4639 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 4640 int64_t sector_num, 4641 QEMUIOVector *qiov, 4642 int nb_sectors, 4643 BlockDriverCompletionFunc *cb, 4644 void *opaque, 4645 int is_write) 4646 4647 { 4648 BlockDriverAIOCBSync *acb; 4649 4650 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 4651 acb->is_write = is_write; 4652 acb->qiov = qiov; 4653 acb->bounce = qemu_blockalign(bs, qiov->size); 4654 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); 4655 4656 if (is_write) { 4657 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 4658 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 4659 } else { 4660 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 4661 } 4662 4663 qemu_bh_schedule(acb->bh); 4664 4665 return &acb->common; 4666 } 4667 4668 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 4669 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 4670 BlockDriverCompletionFunc *cb, void *opaque) 4671 { 4672 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 4673 } 4674 4675 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 4676 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 4677 BlockDriverCompletionFunc *cb, void *opaque) 4678 { 4679 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 4680 } 4681 4682 4683 typedef struct BlockDriverAIOCBCoroutine { 4684 BlockDriverAIOCB common; 4685 BlockRequest req; 4686 bool is_write; 4687 bool *done; 4688 QEMUBH* bh; 4689 } BlockDriverAIOCBCoroutine; 4690 4691 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb) 4692 { 4693 AioContext *aio_context = bdrv_get_aio_context(blockacb->bs); 4694 BlockDriverAIOCBCoroutine *acb = 4695 container_of(blockacb, BlockDriverAIOCBCoroutine, common); 4696 bool done = false; 4697 4698 acb->done = &done; 4699 while (!done) { 4700 aio_poll(aio_context, true); 4701 } 4702 } 4703 4704 static const AIOCBInfo bdrv_em_co_aiocb_info = { 4705 .aiocb_size = 
sizeof(BlockDriverAIOCBCoroutine), 4706 .cancel = bdrv_aio_co_cancel_em, 4707 }; 4708 4709 static void bdrv_co_em_bh(void *opaque) 4710 { 4711 BlockDriverAIOCBCoroutine *acb = opaque; 4712 4713 acb->common.cb(acb->common.opaque, acb->req.error); 4714 4715 if (acb->done) { 4716 *acb->done = true; 4717 } 4718 4719 qemu_bh_delete(acb->bh); 4720 qemu_aio_release(acb); 4721 } 4722 4723 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 4724 static void coroutine_fn bdrv_co_do_rw(void *opaque) 4725 { 4726 BlockDriverAIOCBCoroutine *acb = opaque; 4727 BlockDriverState *bs = acb->common.bs; 4728 4729 if (!acb->is_write) { 4730 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 4731 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 4732 } else { 4733 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 4734 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 4735 } 4736 4737 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4738 qemu_bh_schedule(acb->bh); 4739 } 4740 4741 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 4742 int64_t sector_num, 4743 QEMUIOVector *qiov, 4744 int nb_sectors, 4745 BdrvRequestFlags flags, 4746 BlockDriverCompletionFunc *cb, 4747 void *opaque, 4748 bool is_write) 4749 { 4750 Coroutine *co; 4751 BlockDriverAIOCBCoroutine *acb; 4752 4753 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4754 acb->req.sector = sector_num; 4755 acb->req.nb_sectors = nb_sectors; 4756 acb->req.qiov = qiov; 4757 acb->req.flags = flags; 4758 acb->is_write = is_write; 4759 acb->done = NULL; 4760 4761 co = qemu_coroutine_create(bdrv_co_do_rw); 4762 qemu_coroutine_enter(co, acb); 4763 4764 return &acb->common; 4765 } 4766 4767 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 4768 { 4769 BlockDriverAIOCBCoroutine *acb = opaque; 4770 BlockDriverState *bs = acb->common.bs; 4771 4772 acb->req.error = bdrv_co_flush(bs); 4773 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4774 qemu_bh_schedule(acb->bh); 4775 } 4776 4777 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, 4778 BlockDriverCompletionFunc *cb, void *opaque) 4779 { 4780 trace_bdrv_aio_flush(bs, opaque); 4781 4782 Coroutine *co; 4783 BlockDriverAIOCBCoroutine *acb; 4784 4785 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4786 acb->done = NULL; 4787 4788 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 4789 qemu_coroutine_enter(co, acb); 4790 4791 return &acb->common; 4792 } 4793 4794 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 4795 { 4796 BlockDriverAIOCBCoroutine *acb = opaque; 4797 BlockDriverState *bs = acb->common.bs; 4798 4799 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 4800 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 4801 qemu_bh_schedule(acb->bh); 4802 } 4803 4804 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, 4805 int64_t sector_num, int nb_sectors, 4806 BlockDriverCompletionFunc *cb, void *opaque) 4807 { 4808 Coroutine *co; 4809 BlockDriverAIOCBCoroutine *acb; 4810 4811 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 4812 4813 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4814 acb->req.sector = sector_num; 4815 acb->req.nb_sectors = nb_sectors; 4816 acb->done = NULL; 4817 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 4818 qemu_coroutine_enter(co, acb); 4819 4820 return &acb->common; 4821 } 4822 4823 void bdrv_init(void) 4824 { 4825 module_call_init(MODULE_INIT_BLOCK); 4826 } 4827 4828 void 
bdrv_init_with_whitelist(void) 4829 { 4830 use_bdrv_whitelist = 1; 4831 bdrv_init(); 4832 } 4833 4834 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 4835 BlockDriverCompletionFunc *cb, void *opaque) 4836 { 4837 BlockDriverAIOCB *acb; 4838 4839 acb = g_slice_alloc(aiocb_info->aiocb_size); 4840 acb->aiocb_info = aiocb_info; 4841 acb->bs = bs; 4842 acb->cb = cb; 4843 acb->opaque = opaque; 4844 return acb; 4845 } 4846 4847 void qemu_aio_release(void *p) 4848 { 4849 BlockDriverAIOCB *acb = p; 4850 g_slice_free1(acb->aiocb_info->aiocb_size, acb); 4851 } 4852 4853 /**************************************************************/ 4854 /* Coroutine block device emulation */ 4855 4856 typedef struct CoroutineIOCompletion { 4857 Coroutine *coroutine; 4858 int ret; 4859 } CoroutineIOCompletion; 4860 4861 static void bdrv_co_io_em_complete(void *opaque, int ret) 4862 { 4863 CoroutineIOCompletion *co = opaque; 4864 4865 co->ret = ret; 4866 qemu_coroutine_enter(co->coroutine, NULL); 4867 } 4868 4869 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 4870 int nb_sectors, QEMUIOVector *iov, 4871 bool is_write) 4872 { 4873 CoroutineIOCompletion co = { 4874 .coroutine = qemu_coroutine_self(), 4875 }; 4876 BlockDriverAIOCB *acb; 4877 4878 if (is_write) { 4879 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 4880 bdrv_co_io_em_complete, &co); 4881 } else { 4882 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 4883 bdrv_co_io_em_complete, &co); 4884 } 4885 4886 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 4887 if (!acb) { 4888 return -EIO; 4889 } 4890 qemu_coroutine_yield(); 4891 4892 return co.ret; 4893 } 4894 4895 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 4896 int64_t sector_num, int nb_sectors, 4897 QEMUIOVector *iov) 4898 { 4899 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 4900 } 4901 4902 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 4903 int64_t sector_num, int nb_sectors, 4904 QEMUIOVector *iov) 4905 { 4906 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 4907 } 4908 4909 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 4910 { 4911 RwCo *rwco = opaque; 4912 4913 rwco->ret = bdrv_co_flush(rwco->bs); 4914 } 4915 4916 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 4917 { 4918 int ret; 4919 4920 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { 4921 return 0; 4922 } 4923 4924 /* Write back cached data to the OS even with cache=unsafe */ 4925 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 4926 if (bs->drv->bdrv_co_flush_to_os) { 4927 ret = bs->drv->bdrv_co_flush_to_os(bs); 4928 if (ret < 0) { 4929 return ret; 4930 } 4931 } 4932 4933 /* But don't actually force it to the disk with cache=unsafe */ 4934 if (bs->open_flags & BDRV_O_NO_FLUSH) { 4935 goto flush_parent; 4936 } 4937 4938 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 4939 if (bs->drv->bdrv_co_flush_to_disk) { 4940 ret = bs->drv->bdrv_co_flush_to_disk(bs); 4941 } else if (bs->drv->bdrv_aio_flush) { 4942 BlockDriverAIOCB *acb; 4943 CoroutineIOCompletion co = { 4944 .coroutine = qemu_coroutine_self(), 4945 }; 4946 4947 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 4948 if (acb == NULL) { 4949 ret = -EIO; 4950 } else { 4951 qemu_coroutine_yield(); 4952 ret = co.ret; 4953 } 4954 } else { 4955 /* 4956 * Some block drivers always operate in either writethrough or unsafe 4957 * mode and don't support bdrv_flush therefore. 
Usually qemu doesn't 4958 * know how the server works (because the behaviour is hardcoded or 4959 * depends on server-side configuration), so we can't ensure that 4960 * everything is safe on disk. Returning an error doesn't work because 4961 * that would break guests even if the server operates in writethrough 4962 * mode. 4963 * 4964 * Let's hope the user knows what he's doing. 4965 */ 4966 ret = 0; 4967 } 4968 if (ret < 0) { 4969 return ret; 4970 } 4971 4972 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 4973 * in the case of cache=unsafe, so there are no useless flushes. 4974 */ 4975 flush_parent: 4976 return bdrv_co_flush(bs->file); 4977 } 4978 4979 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) 4980 { 4981 Error *local_err = NULL; 4982 int ret; 4983 4984 if (!bs->drv) { 4985 return; 4986 } 4987 4988 if (bs->drv->bdrv_invalidate_cache) { 4989 bs->drv->bdrv_invalidate_cache(bs, &local_err); 4990 } else if (bs->file) { 4991 bdrv_invalidate_cache(bs->file, &local_err); 4992 } 4993 if (local_err) { 4994 error_propagate(errp, local_err); 4995 return; 4996 } 4997 4998 ret = refresh_total_sectors(bs, bs->total_sectors); 4999 if (ret < 0) { 5000 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 5001 return; 5002 } 5003 } 5004 5005 void bdrv_invalidate_cache_all(Error **errp) 5006 { 5007 BlockDriverState *bs; 5008 Error *local_err = NULL; 5009 5010 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 5011 AioContext *aio_context = bdrv_get_aio_context(bs); 5012 5013 aio_context_acquire(aio_context); 5014 bdrv_invalidate_cache(bs, &local_err); 5015 aio_context_release(aio_context); 5016 if (local_err) { 5017 error_propagate(errp, local_err); 5018 return; 5019 } 5020 } 5021 } 5022 5023 void bdrv_clear_incoming_migration_all(void) 5024 { 5025 BlockDriverState *bs; 5026 5027 QTAILQ_FOREACH(bs, &bdrv_states, device_list) { 5028 AioContext *aio_context = bdrv_get_aio_context(bs); 5029 5030 aio_context_acquire(aio_context); 5031 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING); 5032 aio_context_release(aio_context); 5033 } 5034 } 5035 5036 int bdrv_flush(BlockDriverState *bs) 5037 { 5038 Coroutine *co; 5039 RwCo rwco = { 5040 .bs = bs, 5041 .ret = NOT_DONE, 5042 }; 5043 5044 if (qemu_in_coroutine()) { 5045 /* Fast-path if already in coroutine context */ 5046 bdrv_flush_co_entry(&rwco); 5047 } else { 5048 AioContext *aio_context = bdrv_get_aio_context(bs); 5049 5050 co = qemu_coroutine_create(bdrv_flush_co_entry); 5051 qemu_coroutine_enter(co, &rwco); 5052 while (rwco.ret == NOT_DONE) { 5053 aio_poll(aio_context, true); 5054 } 5055 } 5056 5057 return rwco.ret; 5058 } 5059 5060 typedef struct DiscardCo { 5061 BlockDriverState *bs; 5062 int64_t sector_num; 5063 int nb_sectors; 5064 int ret; 5065 } DiscardCo; 5066 static void coroutine_fn bdrv_discard_co_entry(void *opaque) 5067 { 5068 DiscardCo *rwco = opaque; 5069 5070 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); 5071 } 5072 5073 /* if no limit is specified in the BlockLimits use a default 5074 * of 32768 512-byte sectors (16 MiB) per request. 
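 */

/*
 * Illustrative sketch, not part of the original file: discarding a whole
 * device through the synchronous bdrv_discard() wrapper defined below.
 * The function name and the per-call clamp are our own assumptions; the
 * bdrv_* calls are the ones defined in this file.
 */
static int __attribute__((unused))
example_discard_whole_device(BlockDriverState *bs)
{
    int64_t length = bdrv_getlength(bs);
    int64_t sector_num = 0;
    int64_t nb_sectors;

    if (length < 0) {
        return length; /* bdrv_getlength() failed */
    }
    nb_sectors = length >> BDRV_SECTOR_BITS;

    while (nb_sectors > 0) {
        /* the API takes an int sector count, so clamp each call */
        int num = MIN(nb_sectors, (int64_t)(INT_MAX / BDRV_SECTOR_SIZE));
        int ret = bdrv_discard(bs, sector_num, num);

        if (ret < 0) {
            return ret;
        }
        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

/* The default request size mentioned above: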
 */
#define MAX_DISCARD_DEFAULT 32768

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockDriverAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass only this iteration's chunk (num), not the whole
             * remaining request, so the AIO path honours the same
             * alignment and size limits as the coroutine path. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media.
Otherwise, close the tray 5208 */ 5209 void bdrv_eject(BlockDriverState *bs, bool eject_flag) 5210 { 5211 BlockDriver *drv = bs->drv; 5212 5213 if (drv && drv->bdrv_eject) { 5214 drv->bdrv_eject(bs, eject_flag); 5215 } 5216 5217 if (bs->device_name[0] != '\0') { 5218 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs), 5219 eject_flag, &error_abort); 5220 } 5221 } 5222 5223 /** 5224 * Lock or unlock the media (if it is locked, the user won't be able 5225 * to eject it manually). 5226 */ 5227 void bdrv_lock_medium(BlockDriverState *bs, bool locked) 5228 { 5229 BlockDriver *drv = bs->drv; 5230 5231 trace_bdrv_lock_medium(bs, locked); 5232 5233 if (drv && drv->bdrv_lock_medium) { 5234 drv->bdrv_lock_medium(bs, locked); 5235 } 5236 } 5237 5238 /* needed for generic scsi interface */ 5239 5240 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 5241 { 5242 BlockDriver *drv = bs->drv; 5243 5244 if (drv && drv->bdrv_ioctl) 5245 return drv->bdrv_ioctl(bs, req, buf); 5246 return -ENOTSUP; 5247 } 5248 5249 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, 5250 unsigned long int req, void *buf, 5251 BlockDriverCompletionFunc *cb, void *opaque) 5252 { 5253 BlockDriver *drv = bs->drv; 5254 5255 if (drv && drv->bdrv_aio_ioctl) 5256 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); 5257 return NULL; 5258 } 5259 5260 void bdrv_set_guest_block_size(BlockDriverState *bs, int align) 5261 { 5262 bs->guest_block_size = align; 5263 } 5264 5265 void *qemu_blockalign(BlockDriverState *bs, size_t size) 5266 { 5267 return qemu_memalign(bdrv_opt_mem_align(bs), size); 5268 } 5269 5270 /* 5271 * Check if all memory in this vector is sector aligned. 5272 */ 5273 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 5274 { 5275 int i; 5276 size_t alignment = bdrv_opt_mem_align(bs); 5277 5278 for (i = 0; i < qiov->niov; i++) { 5279 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 5280 return false; 5281 } 5282 if (qiov->iov[i].iov_len % alignment) { 5283 return false; 5284 } 5285 } 5286 5287 return true; 5288 } 5289 5290 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, 5291 Error **errp) 5292 { 5293 int64_t bitmap_size; 5294 BdrvDirtyBitmap *bitmap; 5295 5296 assert((granularity & (granularity - 1)) == 0); 5297 5298 granularity >>= BDRV_SECTOR_BITS; 5299 assert(granularity); 5300 bitmap_size = bdrv_getlength(bs); 5301 if (bitmap_size < 0) { 5302 error_setg_errno(errp, -bitmap_size, "could not get length of device"); 5303 errno = -bitmap_size; 5304 return NULL; 5305 } 5306 bitmap_size >>= BDRV_SECTOR_BITS; 5307 bitmap = g_malloc0(sizeof(BdrvDirtyBitmap)); 5308 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); 5309 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); 5310 return bitmap; 5311 } 5312 5313 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) 5314 { 5315 BdrvDirtyBitmap *bm, *next; 5316 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { 5317 if (bm == bitmap) { 5318 QLIST_REMOVE(bitmap, list); 5319 hbitmap_free(bitmap->bitmap); 5320 g_free(bitmap); 5321 return; 5322 } 5323 } 5324 } 5325 5326 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) 5327 { 5328 BdrvDirtyBitmap *bm; 5329 BlockDirtyInfoList *list = NULL; 5330 BlockDirtyInfoList **plist = &list; 5331 5332 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { 5333 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo)); 5334 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList)); 5335 info->count = 
bdrv_get_dirty_count(bs, bm); 5336 info->granularity = 5337 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); 5338 entry->value = info; 5339 *plist = entry; 5340 plist = &entry->next; 5341 } 5342 5343 return list; 5344 } 5345 5346 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector) 5347 { 5348 if (bitmap) { 5349 return hbitmap_get(bitmap->bitmap, sector); 5350 } else { 5351 return 0; 5352 } 5353 } 5354 5355 void bdrv_dirty_iter_init(BlockDriverState *bs, 5356 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) 5357 { 5358 hbitmap_iter_init(hbi, bitmap->bitmap, 0); 5359 } 5360 5361 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, 5362 int nr_sectors) 5363 { 5364 BdrvDirtyBitmap *bitmap; 5365 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { 5366 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); 5367 } 5368 } 5369 5370 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) 5371 { 5372 BdrvDirtyBitmap *bitmap; 5373 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { 5374 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); 5375 } 5376 } 5377 5378 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) 5379 { 5380 return hbitmap_count(bitmap->bitmap); 5381 } 5382 5383 /* Get a reference to bs */ 5384 void bdrv_ref(BlockDriverState *bs) 5385 { 5386 bs->refcnt++; 5387 } 5388 5389 /* Release a previously grabbed reference to bs. 5390 * If after releasing, reference count is zero, the BlockDriverState is 5391 * deleted. */ 5392 void bdrv_unref(BlockDriverState *bs) 5393 { 5394 assert(bs->refcnt > 0); 5395 if (--bs->refcnt == 0) { 5396 bdrv_delete(bs); 5397 } 5398 } 5399 5400 struct BdrvOpBlocker { 5401 Error *reason; 5402 QLIST_ENTRY(BdrvOpBlocker) list; 5403 }; 5404 5405 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp) 5406 { 5407 BdrvOpBlocker *blocker; 5408 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5409 if (!QLIST_EMPTY(&bs->op_blockers[op])) { 5410 blocker = QLIST_FIRST(&bs->op_blockers[op]); 5411 if (errp) { 5412 error_setg(errp, "Device '%s' is busy: %s", 5413 bs->device_name, error_get_pretty(blocker->reason)); 5414 } 5415 return true; 5416 } 5417 return false; 5418 } 5419 5420 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason) 5421 { 5422 BdrvOpBlocker *blocker; 5423 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5424 5425 blocker = g_malloc0(sizeof(BdrvOpBlocker)); 5426 blocker->reason = reason; 5427 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list); 5428 } 5429 5430 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason) 5431 { 5432 BdrvOpBlocker *blocker, *next; 5433 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX); 5434 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) { 5435 if (blocker->reason == reason) { 5436 QLIST_REMOVE(blocker, list); 5437 g_free(blocker); 5438 } 5439 } 5440 } 5441 5442 void bdrv_op_block_all(BlockDriverState *bs, Error *reason) 5443 { 5444 int i; 5445 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5446 bdrv_op_block(bs, i, reason); 5447 } 5448 } 5449 5450 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason) 5451 { 5452 int i; 5453 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5454 bdrv_op_unblock(bs, i, reason); 5455 } 5456 } 5457 5458 bool bdrv_op_blocker_is_empty(BlockDriverState *bs) 5459 { 5460 int i; 5461 5462 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) { 5463 if (!QLIST_EMPTY(&bs->op_blockers[i])) { 5464 return false; 5465 } 5466 } 5467 return true; 5468 } 5469 5470 void 
bdrv_iostatus_enable(BlockDriverState *bs) 5471 { 5472 bs->iostatus_enabled = true; 5473 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; 5474 } 5475 5476 /* The I/O status is only enabled if the drive explicitly 5477 * enables it _and_ the VM is configured to stop on errors */ 5478 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs) 5479 { 5480 return (bs->iostatus_enabled && 5481 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || 5482 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP || 5483 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP)); 5484 } 5485 5486 void bdrv_iostatus_disable(BlockDriverState *bs) 5487 { 5488 bs->iostatus_enabled = false; 5489 } 5490 5491 void bdrv_iostatus_reset(BlockDriverState *bs) 5492 { 5493 if (bdrv_iostatus_is_enabled(bs)) { 5494 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; 5495 if (bs->job) { 5496 block_job_iostatus_reset(bs->job); 5497 } 5498 } 5499 } 5500 5501 void bdrv_iostatus_set_err(BlockDriverState *bs, int error) 5502 { 5503 assert(bdrv_iostatus_is_enabled(bs)); 5504 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { 5505 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE : 5506 BLOCK_DEVICE_IO_STATUS_FAILED; 5507 } 5508 } 5509 5510 void 5511 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes, 5512 enum BlockAcctType type) 5513 { 5514 assert(type < BDRV_MAX_IOTYPE); 5515 5516 cookie->bytes = bytes; 5517 cookie->start_time_ns = get_clock(); 5518 cookie->type = type; 5519 } 5520 5521 void 5522 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie) 5523 { 5524 assert(cookie->type < BDRV_MAX_IOTYPE); 5525 5526 bs->nr_bytes[cookie->type] += cookie->bytes; 5527 bs->nr_ops[cookie->type]++; 5528 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns; 5529 } 5530 5531 void bdrv_img_create(const char *filename, const char *fmt, 5532 const char *base_filename, const char *base_fmt, 5533 char *options, uint64_t img_size, int flags, 5534 Error **errp, bool quiet) 5535 { 5536 QemuOptsList *create_opts = NULL; 5537 QemuOpts *opts = NULL; 5538 const char *backing_fmt, *backing_file; 5539 int64_t size; 5540 BlockDriver *drv, *proto_drv; 5541 BlockDriver *backing_drv = NULL; 5542 Error *local_err = NULL; 5543 int ret = 0; 5544 5545 /* Find driver and parse its options */ 5546 drv = bdrv_find_format(fmt); 5547 if (!drv) { 5548 error_setg(errp, "Unknown file format '%s'", fmt); 5549 return; 5550 } 5551 5552 proto_drv = bdrv_find_protocol(filename, true); 5553 if (!proto_drv) { 5554 error_setg(errp, "Unknown protocol '%s'", filename); 5555 return; 5556 } 5557 5558 create_opts = qemu_opts_append(create_opts, drv->create_opts); 5559 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); 5560 5561 /* Create parameter list with default values */ 5562 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort); 5563 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size); 5564 5565 /* Parse -o options */ 5566 if (options) { 5567 if (qemu_opts_do_parse(opts, options, NULL) != 0) { 5568 error_setg(errp, "Invalid options for file format '%s'", fmt); 5569 goto out; 5570 } 5571 } 5572 5573 if (base_filename) { 5574 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) { 5575 error_setg(errp, "Backing file not supported for file format '%s'", 5576 fmt); 5577 goto out; 5578 } 5579 } 5580 5581 if (base_fmt) { 5582 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) { 5583 error_setg(errp, "Backing file format not supported for file " 5584 "format '%s'", fmt); 5585 goto out; 5586 } 5587 } 5588 5589 backing_file 
= qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE); 5590 if (backing_file) { 5591 if (!strcmp(filename, backing_file)) { 5592 error_setg(errp, "Error: Trying to create an image with the " 5593 "same filename as the backing file"); 5594 goto out; 5595 } 5596 } 5597 5598 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); 5599 if (backing_fmt) { 5600 backing_drv = bdrv_find_format(backing_fmt); 5601 if (!backing_drv) { 5602 error_setg(errp, "Unknown backing file format '%s'", 5603 backing_fmt); 5604 goto out; 5605 } 5606 } 5607 5608 // The size for the image must always be specified, with one exception: 5609 // If we are using a backing file, we can obtain the size from there 5610 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); 5611 if (size == -1) { 5612 if (backing_file) { 5613 BlockDriverState *bs; 5614 uint64_t size; 5615 int back_flags; 5616 5617 /* backing files always opened read-only */ 5618 back_flags = 5619 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); 5620 5621 bs = NULL; 5622 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags, 5623 backing_drv, &local_err); 5624 if (ret < 0) { 5625 error_setg_errno(errp, -ret, "Could not open '%s': %s", 5626 backing_file, 5627 error_get_pretty(local_err)); 5628 error_free(local_err); 5629 local_err = NULL; 5630 goto out; 5631 } 5632 bdrv_get_geometry(bs, &size); 5633 size *= 512; 5634 5635 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size); 5636 5637 bdrv_unref(bs); 5638 } else { 5639 error_setg(errp, "Image creation needs a size parameter"); 5640 goto out; 5641 } 5642 } 5643 5644 if (!quiet) { 5645 printf("Formatting '%s', fmt=%s ", filename, fmt); 5646 qemu_opts_print(opts); 5647 puts(""); 5648 } 5649 5650 ret = bdrv_create(drv, filename, opts, &local_err); 5651 5652 if (ret == -EFBIG) { 5653 /* This is generally a better message than whatever the driver would 5654 * deliver (especially because of the cluster_size_hint), since that 5655 * is most probably not much different from "image too large". 
 */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}

void bdrv_detach_aio_context(BlockDriverState *bs)
{
    if (!bs->drv) {
        return;
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}

void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }
}

void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts);
}

/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test if the given bs is the candidate or recurse more in the
 * node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non-filter driver -> check if the bs is the same
     * as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but does not allow recursion -> return
     * false
     */
    return false;
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain. Since we don't have pointers to parents it explores all bs chains
 * from the top. Some filters can choose not to pass down the recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}
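/*
 * Illustrative sketch, not part of the original file: how a management
 * command that takes a node name (for example drive-mirror's 'replaces'
 * argument) might use bdrv_is_first_non_filter() to refuse nodes that sit
 * below a block filter. The function name and error wording are our own;
 * only bdrv_find_node(), bdrv_is_first_non_filter() and error_setg() come
 * from the existing code.
 */
static bool __attribute__((unused))
example_check_replace_target(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace = bdrv_find_node(node_name);

    if (!to_replace) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return false;
    }

    /* Filters can hide a node from the guest-visible chain; only accept
     * nodes that some chain reaches before any non-filter driver. */
    if (!bdrv_is_first_non_filter(to_replace)) {
        error_setg(errp, "Node '%s' is hidden behind a block filter",
                   node_name);
        return false;
    }

    return true;
}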