/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
        || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
        || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
        || io_limits->iops[BLOCK_IO_LIMIT_READ]
        || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
        || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are meant to be serviced in FIFO order.  The next throttled
     * request is not dequeued until the current request has been allowed to
     * proceed, so if the current request still exceeds the limits it is
     * re-inserted at the head of the queue, and all requests that arrived
     * after it remain queued behind it in throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
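/* Illustrative sketch (not part of the original file): a coroutine-based
 * request path is expected to call the intercept helper before touching the
 * driver, e.g.:
 *
 *     static int coroutine_fn example_co_readv(BlockDriverState *bs,
 *                                              int64_t sector_num,
 *                                              int nb_sectors,
 *                                              QEMUIOVector *qiov)
 *     {
 *         if (bs->io_limits_enabled) {
 *             bdrv_io_limits_intercept(bs, false, nb_sectors);
 *         }
 *         return bs->drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 *     }
 *
 * example_co_readv() is hypothetical; bdrv_co_do_readv(), declared above,
 * follows this pattern.
 */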
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
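/* Worked example (illustrative): with
 *
 *     path_combine(dest, sizeof(dest), "/images/vm.qcow2", "base.qcow2");
 *
 * the filename is relative, so the directory part of base_path is kept and
 * dest becomes "/images/base.qcow2".  With a protocol-style base path such
 * as "nbd:/images/vm.qcow2", the prefix up to and including the ':' is
 * preserved as well.
 */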
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;  /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}
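/* Illustrative usage (hypothetical caller, in the spirit of qemu-img create):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 10 * 1024 * 1024);
 *     int ret = bdrv_create(drv, "test.qcow2", opts);
 *     free_option_parameters(opts);
 *
 * bdrv_create() takes care of running the driver callback in a coroutine, so
 * callers may invoke it from either context.
 */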
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
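/* How the probing above plays out in practice (illustrative): each driver's
 * .bdrv_probe() inspects the first bytes of the image and returns a score.
 * A format driver that recognizes its own magic returns a high score (a
 * qcow2 header, for example, starts with the bytes 'Q' 'F' 'I' 0xfb), while
 * the raw driver returns a very low catch-all score, so an unrecognized
 * image falls back to raw rather than failing outright.
 */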
/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
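/* Summary of the mapping above (illustrative), as used by -drive cache=...:
 *
 *     "none"/"off"   -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB  (O_DIRECT, writeback)
 *     "directsync"   -> BDRV_O_NOCACHE                    (O_DIRECT, writethrough)
 *     "writeback"    -> BDRV_O_CACHE_WB
 *     "unsafe"       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH (flushes ignored)
 *     "writethrough" -> no flags (the default)
 *
 * A caller would do, e.g.:
 *
 *     int flags = 0;
 *     if (bdrv_parse_cache_flags("none", &flags) < 0) {
 *         ... report an invalid cache mode ...
 *     }
 */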
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    const char *filename, QDict *options,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = bdrv_open_flags(bs, flags);

    bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        if (file != NULL) {
            bdrv_swap(file, bs);
            ret = 0;
        } else {
            ret = drv->bdrv_file_open(bs, filename, options, open_flags);
        }
    } else {
        assert(file != NULL);
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags);
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   QDict *options, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        QDECREF(options);
        return -ENOENT;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    if (drv->bdrv_parse_filename) {
        Error *local_err = NULL;
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            qerror_report_err(local_err);
            error_free(local_err);
            ret = -EINVAL;
            goto fail;
        }
    }

    ret = bdrv_open_common(bs, NULL, filename, options, flags, drv);
    if (ret < 0) {
        goto fail;
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't "
                      "support the option '%s'",
                      drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_delete(bs);
    return ret;
}
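/* Illustrative caller sketch: because bdrv_file_open() consumes one reference
 * to the options QDict even on failure, a caller that wants to keep using the
 * dictionary must take its own reference first:
 *
 *     QINCREF(opts);                                    // keep a reference
 *     ret = bdrv_file_open(&bs, filename, opts, flags); // consumes one ref
 *     if (ret < 0) {
 *         ... opts is still valid here thanks to QINCREF ...
 *     }
 *     QDECREF(opts);                                    // drop our reference
 */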
int bdrv_open_backing_file(BlockDriverState *bs)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;

    if (bs->backing_hd != NULL) {
        return 0;
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (bs->backing_file[0] == '\0') {
        return 0;
    }

    bs->backing_hd = bdrv_new("");
    bdrv_get_full_backing_filename(bs, backing_filename,
                                   sizeof(backing_filename));

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);

    ret = bdrv_open(bs->backing_hd, backing_filename, NULL,
                    back_flags, back_drv);
    if (ret < 0) {
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        return ret;
    }
    return 0;
}

static void extract_subqdict(QDict *src, QDict **dst, const char *start)
{
    const QDictEntry *entry, *next;
    const char *p;

    *dst = qdict_new();
    entry = qdict_first(src);

    while (entry != NULL) {
        next = qdict_next(src, entry);
        if (strstart(entry->key, start, &p)) {
            qobject_incref(entry->value);
            qdict_put_obj(*dst, p, entry->value);
            qdict_del(src, entry->key);
        }
        entry = next;
    }
}
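/* Worked example (illustrative, with hypothetical option names): given
 *
 *     { "file.driver": "file", "file.filename": "/tmp/x.img", "lazy-refcounts": "on" }
 *
 * extract_subqdict(src, &dst, "file.") moves the prefixed keys into dst as
 *
 *     { "driver": "file", "filename": "/tmp/x.img" }
 *
 * leaving only { "lazy-refcounts": "on" } in src.  bdrv_open() below uses
 * this to split protocol-level options out of the format-level ones.
 */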
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    QDict *file_options = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, NULL, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            goto fail;
        }

        /* Real path is meaningless for protocols */
        if (path_has_protocol(filename)) {
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        } else if (!realpath(filename, backing_filename)) {
            ret = -errno;
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                                 drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            goto fail;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    extract_subqdict(options, &file_options, "file.");

    ret = bdrv_file_open(&file, filename, file_options,
                         bdrv_open_flags(bs, flags));
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(file, filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, filename, options, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (bs->file != file) {
        bdrv_delete(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        ret = bdrv_open_backing_file(bs);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
                      "device '%s' doesn't support the option '%s'",
                      drv->format_name, bs->device_name, entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_delete(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    return ret;
}
typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized.  This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
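/* Illustrative usage (hypothetical caller): to flip two drives read-write in
 * one atomic set, queue both and then commit the whole queue:
 *
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags | BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags | BDRV_O_RDWR);
 *     ret = bdrv_reopen_multiple(queue, &errp);   // also frees the queue
 *
 * bdrv_reopen() below is the single-device convenience wrapper around exactly
 * this pattern.
 */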
/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen.  All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block drivers' .bdrv_reopen_prepare() implementations.
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_set(errp, QERR_OPEN_FILE_FAILED,
                          reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_time         = bs_src->slice_time;
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->io_base            = bs_src->io_base;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list = bs_src->list;
}
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap! */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
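/* Illustrative sketch: bdrv_append() below is the primitive behind external
 * snapshots.  A hypothetical caller creates a new overlay image whose backing
 * file is the currently attached image, opens it anonymously, and splices it
 * on top of the chain while the guest keeps running:
 *
 *     BlockDriverState *new_bs = bdrv_new("");   // anonymous, as required
 *     ret = bdrv_open(new_bs, overlay_name, NULL,
 *                     flags | BDRV_O_NO_BACKING, overlay_drv);
 *     if (ret == 0) {
 *         bdrv_append(new_bs, existing_bs);  // guest now writes to the overlay
 *     }
 *
 * overlay_name, overlay_drv and flags are placeholders for whatever the
 * caller prepared beforehand.
 */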
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}
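/* Illustrative sketch of how a device model hooks in (hypothetical callback
 * names, in the style of the CD-ROM emulation):
 *
 *     static const BlockDevOps example_block_ops = {
 *         .change_media_cb  = example_change_media_cb,
 *         .eject_request_cb = example_eject_request_cb,
 *         .is_tray_open     = example_is_tray_open,
 *     };
 *
 *     bdrv_set_dev_ops(bs, &example_block_ops, device_state);
 *
 * The registered callbacks are then driven by the helpers below (media
 * change, eject requests, tray and medium-lock queries).
 */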
"read" : "write"); 1585 monitor_protocol_event(ev, data); 1586 1587 qobject_decref(data); 1588 } 1589 1590 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected) 1591 { 1592 QObject *data; 1593 1594 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }", 1595 bdrv_get_device_name(bs), ejected); 1596 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data); 1597 1598 qobject_decref(data); 1599 } 1600 1601 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load) 1602 { 1603 if (bs->dev_ops && bs->dev_ops->change_media_cb) { 1604 bool tray_was_closed = !bdrv_dev_is_tray_open(bs); 1605 bs->dev_ops->change_media_cb(bs->dev_opaque, load); 1606 if (tray_was_closed) { 1607 /* tray open */ 1608 bdrv_emit_qmp_eject_event(bs, true); 1609 } 1610 if (load) { 1611 /* tray close */ 1612 bdrv_emit_qmp_eject_event(bs, false); 1613 } 1614 } 1615 } 1616 1617 bool bdrv_dev_has_removable_media(BlockDriverState *bs) 1618 { 1619 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb); 1620 } 1621 1622 void bdrv_dev_eject_request(BlockDriverState *bs, bool force) 1623 { 1624 if (bs->dev_ops && bs->dev_ops->eject_request_cb) { 1625 bs->dev_ops->eject_request_cb(bs->dev_opaque, force); 1626 } 1627 } 1628 1629 bool bdrv_dev_is_tray_open(BlockDriverState *bs) 1630 { 1631 if (bs->dev_ops && bs->dev_ops->is_tray_open) { 1632 return bs->dev_ops->is_tray_open(bs->dev_opaque); 1633 } 1634 return false; 1635 } 1636 1637 static void bdrv_dev_resize_cb(BlockDriverState *bs) 1638 { 1639 if (bs->dev_ops && bs->dev_ops->resize_cb) { 1640 bs->dev_ops->resize_cb(bs->dev_opaque); 1641 } 1642 } 1643 1644 bool bdrv_dev_is_medium_locked(BlockDriverState *bs) 1645 { 1646 if (bs->dev_ops && bs->dev_ops->is_medium_locked) { 1647 return bs->dev_ops->is_medium_locked(bs->dev_opaque); 1648 } 1649 return false; 1650 } 1651 1652 /* 1653 * Run consistency checks on an image 1654 * 1655 * Returns 0 if the check could be completed (it doesn't mean that the image is 1656 * free of errors) or -errno when an internal error occurred. The results of the 1657 * check are stored in res. 1658 */ 1659 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) 1660 { 1661 if (bs->drv->bdrv_check == NULL) { 1662 return -ENOTSUP; 1663 } 1664 1665 memset(res, 0, sizeof(*res)); 1666 return bs->drv->bdrv_check(bs, res, fix); 1667 } 1668 1669 #define COMMIT_BUF_SECTORS 2048 1670 1671 /* commit COW file into the raw image */ 1672 int bdrv_commit(BlockDriverState *bs) 1673 { 1674 BlockDriver *drv = bs->drv; 1675 int64_t sector, total_sectors; 1676 int n, ro, open_flags; 1677 int ret = 0; 1678 uint8_t *buf; 1679 char filename[PATH_MAX]; 1680 1681 if (!drv) 1682 return -ENOMEDIUM; 1683 1684 if (!bs->backing_hd) { 1685 return -ENOTSUP; 1686 } 1687 1688 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) { 1689 return -EBUSY; 1690 } 1691 1692 ro = bs->backing_hd->read_only; 1693 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. 
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the backing image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}
struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
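/* Worked example (illustrative): with 64 KiB clusters, c is
 * 65536 / 512 = 128 sectors.  Rounding the region sector_num = 100,
 * nb_sectors = 200 gives
 *
 *     *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)     = 0
 *     *cluster_nb_sectors = QEMU_ALIGN_UP(100 + 200, 128) = 384
 *
 * i.e. the request is widened to cover clusters 0..2 completely.
 */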
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    BlockDriverState *overlay = NULL;
    BlockDriverState *intermediate;

    assert(active != NULL);
    assert(bs != NULL);

    /* if bs is the same as active, then by definition it has no overlay
     */
    if (active == bs) {
        return NULL;
    }

    intermediate = active;
    while (intermediate->backing_hd) {
        if (intermediate->backing_hd == bs) {
            overlay = intermediate;
            break;
        }
        intermediate = intermediate->backing_hd;
    }

    return overlay;
}
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;


    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_delete(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}


static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
See bdrv_write() for the return codes */
2145 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2146               uint8_t *buf, int nb_sectors)
2147 {
2148     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
2149 }
2150
2151 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2152 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2153                           uint8_t *buf, int nb_sectors)
2154 {
2155     bool enabled;
2156     int ret;
2157
2158     enabled = bs->io_limits_enabled;
2159     bs->io_limits_enabled = false;
2160     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2161     bs->io_limits_enabled = enabled;
2162     return ret;
2163 }
2164
2165 /* Return < 0 if error. Important errors are:
2166    -EIO         generic I/O error (may happen for all errors)
2167    -ENOMEDIUM   No media inserted.
2168    -EINVAL      Invalid sector number or nb_sectors
2169    -EACCES      Trying to write a read-only device
2170 */
2171 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2172                const uint8_t *buf, int nb_sectors)
2173 {
2174     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
2175 }
2176
2177 int bdrv_pread(BlockDriverState *bs, int64_t offset,
2178                void *buf, int count1)
2179 {
2180     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2181     int len, nb_sectors, count;
2182     int64_t sector_num;
2183     int ret;
2184
2185     count = count1;
2186     /* first read to align to sector start */
2187     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
2188     if (len > count)
2189         len = count;
2190     sector_num = offset >> BDRV_SECTOR_BITS;
2191     if (len > 0) {
2192         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2193             return ret;
2194         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
2195         count -= len;
2196         if (count == 0)
2197             return count1;
2198         sector_num++;
2199         buf += len;
2200     }
2201
2202     /* read the sectors "in place" */
2203     nb_sectors = count >> BDRV_SECTOR_BITS;
2204     if (nb_sectors > 0) {
2205         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
2206             return ret;
2207         sector_num += nb_sectors;
2208         len = nb_sectors << BDRV_SECTOR_BITS;
2209         buf += len;
2210         count -= len;
2211     }
2212
2213     /* add data from the last sector */
2214     if (count > 0) {
2215         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2216             return ret;
2217         memcpy(buf, tmp_buf, count);
2218     }
2219     return count1;
2220 }
2221
2222 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2223                 const void *buf, int count1)
2224 {
2225     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2226     int len, nb_sectors, count;
2227     int64_t sector_num;
2228     int ret;
2229
2230     count = count1;
2231     /* first write to align to sector start */
2232     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
2233     if (len > count)
2234         len = count;
2235     sector_num = offset >> BDRV_SECTOR_BITS;
2236     if (len > 0) {
2237         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2238             return ret;
2239         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
2240         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2241             return ret;
2242         count -= len;
2243         if (count == 0)
2244             return count1;
2245         sector_num++;
2246         buf += len;
2247     }
2248
2249     /* write the sectors "in place" */
2250     nb_sectors = count >> BDRV_SECTOR_BITS;
2251     if (nb_sectors > 0) {
2252         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
2253             return ret;
2254         sector_num += nb_sectors;
2255         len = nb_sectors << BDRV_SECTOR_BITS;
2256         buf += len;
2257         count -= len;
2258     }
2259
2260     /* add data from the last sector */
2261     if (count > 0) {
2262         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2263             return ret;
2264         memcpy(tmp_buf, buf, count);
2265         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2266             return ret;
2267     }
2268     return count1;
2269 }
2270
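/*
 * Illustrative usage sketch (added commentary, not original code; the byte
 * offset and size below are arbitrary placeholder values). bdrv_pread() and
 * bdrv_pwrite() emulate unaligned access on top of the sector-based
 * bdrv_read()/bdrv_write() with the read-modify-write logic above, so the
 * caller needs no sector alignment:
 *
 *     uint8_t header[64];
 *     int ret;
 *
 *     ret = bdrv_pread(bs, 17, header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;            // negative errno, e.g. -EIO
 *     }
 *     // on success both helpers return the full byte count requested
 *     ret = bdrv_pwrite(bs, 17, header, sizeof(header));
 */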
2271 /*
2272  * Writes to the file and ensures that no writes are reordered across this
2273  * request (acts as a barrier)
2274  *
2275  * Returns 0 on success, -errno in error cases.
2276  */
2277 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2278                      const void *buf, int count)
2279 {
2280     int ret;
2281
2282     ret = bdrv_pwrite(bs, offset, buf, count);
2283     if (ret < 0) {
2284         return ret;
2285     }
2286
2287     /* No flush needed for cache modes that already do it */
2288     if (bs->enable_write_cache) {
2289         bdrv_flush(bs);
2290     }
2291
2292     return 0;
2293 }
2294
2295 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2296     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2297 {
2298     /* Perform I/O through a temporary buffer so that users who scribble over
2299      * their read buffer while the operation is in progress do not end up
2300      * modifying the image file. This is critical for zero-copy guest I/O
2301      * where anything might happen inside guest memory.
2302      */
2303     void *bounce_buffer;
2304
2305     BlockDriver *drv = bs->drv;
2306     struct iovec iov;
2307     QEMUIOVector bounce_qiov;
2308     int64_t cluster_sector_num;
2309     int cluster_nb_sectors;
2310     size_t skip_bytes;
2311     int ret;
2312
2313     /* Cover entire cluster so no additional backing file I/O is required when
2314      * allocating cluster in the image file.
2315      */
2316     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2317                            &cluster_sector_num, &cluster_nb_sectors);
2318
2319     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2320                                    cluster_sector_num, cluster_nb_sectors);
2321
2322     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2323     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2324     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2325
2326     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2327                              &bounce_qiov);
2328     if (ret < 0) {
2329         goto err;
2330     }
2331
2332     if (drv->bdrv_co_write_zeroes &&
2333         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2334         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2335                                       cluster_nb_sectors);
2336     } else {
2337         /* This does not change the data on the disk, it is not necessary
2338          * to flush even in cache=writethrough mode.
2339          */
2340         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2341                                   &bounce_qiov);
2342     }
2343
2344     if (ret < 0) {
2345         /* It might be okay to ignore write errors for guest requests. If this
2346          * is a deliberate copy-on-read then we don't want to ignore the error.
2347          * Simply report it in all cases.
2348 */ 2349 goto err; 2350 } 2351 2352 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 2353 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 2354 nb_sectors * BDRV_SECTOR_SIZE); 2355 2356 err: 2357 qemu_vfree(bounce_buffer); 2358 return ret; 2359 } 2360 2361 /* 2362 * Handle a read request in coroutine context 2363 */ 2364 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 2365 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 2366 BdrvRequestFlags flags) 2367 { 2368 BlockDriver *drv = bs->drv; 2369 BdrvTrackedRequest req; 2370 int ret; 2371 2372 if (!drv) { 2373 return -ENOMEDIUM; 2374 } 2375 if (bdrv_check_request(bs, sector_num, nb_sectors)) { 2376 return -EIO; 2377 } 2378 2379 /* throttling disk read I/O */ 2380 if (bs->io_limits_enabled) { 2381 bdrv_io_limits_intercept(bs, false, nb_sectors); 2382 } 2383 2384 if (bs->copy_on_read) { 2385 flags |= BDRV_REQ_COPY_ON_READ; 2386 } 2387 if (flags & BDRV_REQ_COPY_ON_READ) { 2388 bs->copy_on_read_in_flight++; 2389 } 2390 2391 if (bs->copy_on_read_in_flight) { 2392 wait_for_overlapping_requests(bs, sector_num, nb_sectors); 2393 } 2394 2395 tracked_request_begin(&req, bs, sector_num, nb_sectors, false); 2396 2397 if (flags & BDRV_REQ_COPY_ON_READ) { 2398 int pnum; 2399 2400 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum); 2401 if (ret < 0) { 2402 goto out; 2403 } 2404 2405 if (!ret || pnum != nb_sectors) { 2406 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 2407 goto out; 2408 } 2409 } 2410 2411 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 2412 2413 out: 2414 tracked_request_end(&req); 2415 2416 if (flags & BDRV_REQ_COPY_ON_READ) { 2417 bs->copy_on_read_in_flight--; 2418 } 2419 2420 return ret; 2421 } 2422 2423 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 2424 int nb_sectors, QEMUIOVector *qiov) 2425 { 2426 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 2427 2428 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 2429 } 2430 2431 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 2432 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 2433 { 2434 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 2435 2436 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 2437 BDRV_REQ_COPY_ON_READ); 2438 } 2439 2440 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 2441 int64_t sector_num, int nb_sectors) 2442 { 2443 BlockDriver *drv = bs->drv; 2444 QEMUIOVector qiov; 2445 struct iovec iov; 2446 int ret; 2447 2448 /* TODO Emulate only part of misaligned requests instead of letting block 2449 * drivers return -ENOTSUP and emulate everything */ 2450 2451 /* First try the efficient write zeroes operation */ 2452 if (drv->bdrv_co_write_zeroes) { 2453 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); 2454 if (ret != -ENOTSUP) { 2455 return ret; 2456 } 2457 } 2458 2459 /* Fall back to bounce buffer if write zeroes is unsupported */ 2460 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; 2461 iov.iov_base = qemu_blockalign(bs, iov.iov_len); 2462 memset(iov.iov_base, 0, iov.iov_len); 2463 qemu_iovec_init_external(&qiov, &iov, 1); 2464 2465 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); 2466 2467 qemu_vfree(iov.iov_base); 2468 return ret; 2469 } 2470 2471 /* 2472 * Handle a write request in coroutine context 2473 */ 2474 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 2475 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 2476 BdrvRequestFlags 
flags)
2477 {
2478     BlockDriver *drv = bs->drv;
2479     BdrvTrackedRequest req;
2480     int ret;
2481
2482     if (!bs->drv) {
2483         return -ENOMEDIUM;
2484     }
2485     if (bs->read_only) {
2486         return -EACCES;
2487     }
2488     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2489         return -EIO;
2490     }
2491
2492     /* throttling disk write I/O */
2493     if (bs->io_limits_enabled) {
2494         bdrv_io_limits_intercept(bs, true, nb_sectors);
2495     }
2496
2497     if (bs->copy_on_read_in_flight) {
2498         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2499     }
2500
2501     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2502
2503     if (flags & BDRV_REQ_ZERO_WRITE) {
2504         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2505     } else {
2506         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2507     }
2508
2509     if (ret == 0 && !bs->enable_write_cache) {
2510         ret = bdrv_co_flush(bs);
2511     }
2512
2513     if (bs->dirty_bitmap) {
2514         bdrv_set_dirty(bs, sector_num, nb_sectors);
2515     }
2516
2517     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2518         bs->wr_highest_sector = sector_num + nb_sectors - 1;
2519     }
2520
2521     tracked_request_end(&req);
2522
2523     return ret;
2524 }
2525
2526 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2527     int nb_sectors, QEMUIOVector *qiov)
2528 {
2529     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2530
2531     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2532 }
2533
2534 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2535     int64_t sector_num, int nb_sectors)
2536 {
2537     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2538
2539     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2540                              BDRV_REQ_ZERO_WRITE);
2541 }
2542
2543 /**
2544  * Truncate file to 'offset' bytes (needed only for file protocols)
2545  */
2546 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2547 {
2548     BlockDriver *drv = bs->drv;
2549     int ret;
2550     if (!drv)
2551         return -ENOMEDIUM;
2552     if (!drv->bdrv_truncate)
2553         return -ENOTSUP;
2554     if (bs->read_only)
2555         return -EACCES;
2556     if (bdrv_in_use(bs))
2557         return -EBUSY;
2558     ret = drv->bdrv_truncate(bs, offset);
2559     if (ret == 0) {
2560         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2561         bdrv_dev_resize_cb(bs);
2562     }
2563     return ret;
2564 }
2565
2566 /**
2567  * Length of an allocated file in bytes. Sparse files are counted by actual
2568  * allocated space. Return < 0 if error or unknown.
2569  */
2570 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2571 {
2572     BlockDriver *drv = bs->drv;
2573     if (!drv) {
2574         return -ENOMEDIUM;
2575     }
2576     if (drv->bdrv_get_allocated_file_size) {
2577         return drv->bdrv_get_allocated_file_size(bs);
2578     }
2579     if (bs->file) {
2580         return bdrv_get_allocated_file_size(bs->file);
2581     }
2582     return -ENOTSUP;
2583 }
2584
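/*
 * Illustrative sketch (added commentary, not original code): for a sparse or
 * compressed image the two size queries directly below can differ widely.
 * The example values are hypothetical:
 *
 *     int64_t host_bytes = bdrv_get_allocated_file_size(bs);  // e.g. 700 MiB
 *     int64_t guest_bytes = bdrv_getlength(bs);               // e.g. 10 GiB
 *
 * Both return a negative errno (e.g. -ENOMEDIUM, -ENOTSUP) on error or when
 * the size cannot be determined.
 */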
2585 /**
2586  * Length of a file in bytes. Return < 0 if error or unknown.
2587  */
2588 int64_t bdrv_getlength(BlockDriverState *bs)
2589 {
2590     BlockDriver *drv = bs->drv;
2591     if (!drv)
2592         return -ENOMEDIUM;
2593
2594     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2595         if (drv->bdrv_getlength) {
2596             return drv->bdrv_getlength(bs);
2597         }
2598     }
2599     return bs->total_sectors * BDRV_SECTOR_SIZE;
2600 }
2601
2602 /* return 0 as number of sectors if no device present or error */
2603 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2604 {
2605     int64_t length;
2606     length = bdrv_getlength(bs);
2607     if (length < 0)
2608         length = 0;
2609     else
2610         length = length >> BDRV_SECTOR_BITS;
2611     *nb_sectors_ptr = length;
2612 }
2613
2614 /* throttling disk I/O limits */
2615 void bdrv_set_io_limits(BlockDriverState *bs,
2616                         BlockIOLimit *io_limits)
2617 {
2618     bs->io_limits = *io_limits;
2619     bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2620 }
2621
2622 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
2623                        BlockdevOnError on_write_error)
2624 {
2625     bs->on_read_error = on_read_error;
2626     bs->on_write_error = on_write_error;
2627 }
2628
2629 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
2630 {
2631     return is_read ? bs->on_read_error : bs->on_write_error;
2632 }
2633
2634 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
2635 {
2636     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
2637
2638     switch (on_err) {
2639     case BLOCKDEV_ON_ERROR_ENOSPC:
2640         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
2641     case BLOCKDEV_ON_ERROR_STOP:
2642         return BDRV_ACTION_STOP;
2643     case BLOCKDEV_ON_ERROR_REPORT:
2644         return BDRV_ACTION_REPORT;
2645     case BLOCKDEV_ON_ERROR_IGNORE:
2646         return BDRV_ACTION_IGNORE;
2647     default:
2648         abort();
2649     }
2650 }
2651
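/*
 * Illustrative sketch (added commentary, not original code): how the policy
 * mapping above behaves. With on_write_error = BLOCKDEV_ON_ERROR_ENOSPC,
 * only an out-of-space condition pauses the VM; all other errors are
 * reported to the guest:
 *
 *     BlockErrorAction a1 = bdrv_get_error_action(bs, false, ENOSPC);
 *     BlockErrorAction a2 = bdrv_get_error_action(bs, false, EIO);
 *     // a1 == BDRV_ACTION_STOP, a2 == BDRV_ACTION_REPORT
 *
 * (is_read == false selects bs->on_write_error.)
 */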
2652 /* This is done by device models because, while the block layer knows
2653  * about the error, it does not know whether an operation comes from
2654  * the device or the block layer (from a job, for example).
2655  */
2656 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
2657                        bool is_read, int error)
2658 {
2659     assert(error >= 0);
2660     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
2661     if (action == BDRV_ACTION_STOP) {
2662         vm_stop(RUN_STATE_IO_ERROR);
2663         bdrv_iostatus_set_err(bs, error);
2664     }
2665 }
2666
2667 int bdrv_is_read_only(BlockDriverState *bs)
2668 {
2669     return bs->read_only;
2670 }
2671
2672 int bdrv_is_sg(BlockDriverState *bs)
2673 {
2674     return bs->sg;
2675 }
2676
2677 int bdrv_enable_write_cache(BlockDriverState *bs)
2678 {
2679     return bs->enable_write_cache;
2680 }
2681
2682 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
2683 {
2684     bs->enable_write_cache = wce;
2685
2686     /* so a reopen() will preserve wce */
2687     if (wce) {
2688         bs->open_flags |= BDRV_O_CACHE_WB;
2689     } else {
2690         bs->open_flags &= ~BDRV_O_CACHE_WB;
2691     }
2692 }
2693
2694 int bdrv_is_encrypted(BlockDriverState *bs)
2695 {
2696     if (bs->backing_hd && bs->backing_hd->encrypted)
2697         return 1;
2698     return bs->encrypted;
2699 }
2700
2701 int bdrv_key_required(BlockDriverState *bs)
2702 {
2703     BlockDriverState *backing_hd = bs->backing_hd;
2704
2705     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2706         return 1;
2707     return (bs->encrypted && !bs->valid_key);
2708 }
2709
2710 int bdrv_set_key(BlockDriverState *bs, const char *key)
2711 {
2712     int ret;
2713     if (bs->backing_hd && bs->backing_hd->encrypted) {
2714         ret = bdrv_set_key(bs->backing_hd, key);
2715         if (ret < 0)
2716             return ret;
2717         if (!bs->encrypted)
2718             return 0;
2719     }
2720     if (!bs->encrypted) {
2721         return -EINVAL;
2722     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2723         return -ENOMEDIUM;
2724     }
2725     ret = bs->drv->bdrv_set_key(bs, key);
2726     if (ret < 0) {
2727         bs->valid_key = 0;
2728     } else if (!bs->valid_key) {
2729         bs->valid_key = 1;
2730         /* call the change callback now, we skipped it on open */
2731         bdrv_dev_change_media_cb(bs, true);
2732     }
2733     return ret;
2734 }
2735
2736 const char *bdrv_get_format_name(BlockDriverState *bs)
2737 {
2738     return bs->drv ?
bs->drv->format_name : NULL; 2739 } 2740 2741 void bdrv_iterate_format(void (*it)(void *opaque, const char *name), 2742 void *opaque) 2743 { 2744 BlockDriver *drv; 2745 2746 QLIST_FOREACH(drv, &bdrv_drivers, list) { 2747 it(opaque, drv->format_name); 2748 } 2749 } 2750 2751 BlockDriverState *bdrv_find(const char *name) 2752 { 2753 BlockDriverState *bs; 2754 2755 QTAILQ_FOREACH(bs, &bdrv_states, list) { 2756 if (!strcmp(name, bs->device_name)) { 2757 return bs; 2758 } 2759 } 2760 return NULL; 2761 } 2762 2763 BlockDriverState *bdrv_next(BlockDriverState *bs) 2764 { 2765 if (!bs) { 2766 return QTAILQ_FIRST(&bdrv_states); 2767 } 2768 return QTAILQ_NEXT(bs, list); 2769 } 2770 2771 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) 2772 { 2773 BlockDriverState *bs; 2774 2775 QTAILQ_FOREACH(bs, &bdrv_states, list) { 2776 it(opaque, bs); 2777 } 2778 } 2779 2780 const char *bdrv_get_device_name(BlockDriverState *bs) 2781 { 2782 return bs->device_name; 2783 } 2784 2785 int bdrv_get_flags(BlockDriverState *bs) 2786 { 2787 return bs->open_flags; 2788 } 2789 2790 void bdrv_flush_all(void) 2791 { 2792 BlockDriverState *bs; 2793 2794 QTAILQ_FOREACH(bs, &bdrv_states, list) { 2795 bdrv_flush(bs); 2796 } 2797 } 2798 2799 int bdrv_has_zero_init(BlockDriverState *bs) 2800 { 2801 assert(bs->drv); 2802 2803 if (bs->drv->bdrv_has_zero_init) { 2804 return bs->drv->bdrv_has_zero_init(bs); 2805 } 2806 2807 return 1; 2808 } 2809 2810 typedef struct BdrvCoIsAllocatedData { 2811 BlockDriverState *bs; 2812 BlockDriverState *base; 2813 int64_t sector_num; 2814 int nb_sectors; 2815 int *pnum; 2816 int ret; 2817 bool done; 2818 } BdrvCoIsAllocatedData; 2819 2820 /* 2821 * Returns true iff the specified sector is present in the disk image. Drivers 2822 * not implementing the functionality are assumed to not support backing files, 2823 * hence all their sectors are reported as allocated. 2824 * 2825 * If 'sector_num' is beyond the end of the disk image the return value is 0 2826 * and 'pnum' is set to 0. 2827 * 2828 * 'pnum' is set to the number of sectors (including and immediately following 2829 * the specified sector) that are known to be in the same 2830 * allocated/unallocated state. 2831 * 2832 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 2833 * beyond the end of the disk image it will be clamped. 2834 */ 2835 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, 2836 int nb_sectors, int *pnum) 2837 { 2838 int64_t n; 2839 2840 if (sector_num >= bs->total_sectors) { 2841 *pnum = 0; 2842 return 0; 2843 } 2844 2845 n = bs->total_sectors - sector_num; 2846 if (n < nb_sectors) { 2847 nb_sectors = n; 2848 } 2849 2850 if (!bs->drv->bdrv_co_is_allocated) { 2851 *pnum = nb_sectors; 2852 return 1; 2853 } 2854 2855 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum); 2856 } 2857 2858 /* Coroutine wrapper for bdrv_is_allocated() */ 2859 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) 2860 { 2861 BdrvCoIsAllocatedData *data = opaque; 2862 BlockDriverState *bs = data->bs; 2863 2864 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors, 2865 data->pnum); 2866 data->done = true; 2867 } 2868 2869 /* 2870 * Synchronous wrapper around bdrv_co_is_allocated(). 2871 * 2872 * See bdrv_co_is_allocated() for details. 
2873  */
2874 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2875                       int *pnum)
2876 {
2877     Coroutine *co;
2878     BdrvCoIsAllocatedData data = {
2879         .bs = bs,
2880         .sector_num = sector_num,
2881         .nb_sectors = nb_sectors,
2882         .pnum = pnum,
2883         .done = false,
2884     };
2885
2886     co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2887     qemu_coroutine_enter(co, &data);
2888     while (!data.done) {
2889         qemu_aio_wait();
2890     }
2891     return data.ret;
2892 }
2893
2894 /*
2895  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2896  *
2897  * Return true if the given sector is allocated in any image between
2898  * BASE (excluded) and TOP (included). BASE can be NULL to check if the
2899  * given sector is allocated in any image of the chain. Return false otherwise.
2900  *
2901  * 'pnum' is set to the number of sectors (including and immediately following
2902  * the specified sector) that are known to be in the same
2903  * allocated/unallocated state.
2904  *
2905  */
2906 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2907                                             BlockDriverState *base,
2908                                             int64_t sector_num,
2909                                             int nb_sectors, int *pnum)
2910 {
2911     BlockDriverState *intermediate;
2912     int ret, n = nb_sectors;
2913
2914     intermediate = top;
2915     while (intermediate && intermediate != base) {
2916         int pnum_inter;
2917         ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2918                                    &pnum_inter);
2919         if (ret < 0) {
2920             return ret;
2921         } else if (ret) {
2922             *pnum = pnum_inter;
2923             return 1;
2924         }
2925
2926         /*
2927          * [sector_num, nb_sectors] is unallocated on top but intermediate
2928          * might have
2929          *
2930          * [sector_num+x, nb_sectors] allocated.
2931          */
2932         if (n > pnum_inter &&
2933             (intermediate == top ||
2934              sector_num + pnum_inter < intermediate->total_sectors)) {
2935             n = pnum_inter;
2936         }
2937
2938         intermediate = intermediate->backing_hd;
2939     }
2940
2941     *pnum = n;
2942     return 0;
2943 }
2944
2945 /* Coroutine wrapper for bdrv_is_allocated_above() */
2946 static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
2947 {
2948     BdrvCoIsAllocatedData *data = opaque;
2949     BlockDriverState *top = data->bs;
2950     BlockDriverState *base = data->base;
2951
2952     data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
2953                                            data->nb_sectors, data->pnum);
2954     data->done = true;
2955 }
2956
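/*
 * Illustrative sketch (added commentary, not original code): walking a whole
 * image's allocation map with the synchronous wrapper above. 'pnum' always
 * advances, so the loop visits each allocated/unallocated extent once:
 *
 *     int64_t sector = 0;
 *     uint64_t total_sectors;
 *     int pnum, ret;
 *
 *     bdrv_get_geometry(bs, &total_sectors);
 *     while (sector < (int64_t)total_sectors) {
 *         ret = bdrv_is_allocated(bs, sector, total_sectors - sector, &pnum);
 *         if (ret < 0) {
 *             break;      // error reported by the driver
 *         }
 *         // ret == 1: [sector, sector + pnum) is allocated in bs itself;
 *         // ret == 0: the range is unallocated (e.g. read from backing file)
 *         sector += pnum;
 *     }
 *
 * A production caller would clamp 'total_sectors - sector' to INT_MAX, since
 * nb_sectors is a plain int.
 */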
2957 /*
2958  * Synchronous wrapper around bdrv_co_is_allocated_above().
2959  *
2960  * See bdrv_co_is_allocated_above() for details.
2961  */
2962 int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
2963                             int64_t sector_num, int nb_sectors, int *pnum)
2964 {
2965     Coroutine *co;
2966     BdrvCoIsAllocatedData data = {
2967         .bs = top,
2968         .base = base,
2969         .sector_num = sector_num,
2970         .nb_sectors = nb_sectors,
2971         .pnum = pnum,
2972         .done = false,
2973     };
2974
2975     co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
2976     qemu_coroutine_enter(co, &data);
2977     while (!data.done) {
2978         qemu_aio_wait();
2979     }
2980     return data.ret;
2981 }
2982
2983 BlockInfo *bdrv_query_info(BlockDriverState *bs)
2984 {
2985     BlockInfo *info = g_malloc0(sizeof(*info));
2986     info->device = g_strdup(bs->device_name);
2987     info->type = g_strdup("unknown");
2988     info->locked = bdrv_dev_is_medium_locked(bs);
2989     info->removable = bdrv_dev_has_removable_media(bs);
2990
2991     if (bdrv_dev_has_removable_media(bs)) {
2992         info->has_tray_open = true;
2993         info->tray_open = bdrv_dev_is_tray_open(bs);
2994     }
2995
2996     if (bdrv_iostatus_is_enabled(bs)) {
2997         info->has_io_status = true;
2998         info->io_status = bs->iostatus;
2999     }
3000
3001     if (bs->dirty_bitmap) {
3002         info->has_dirty = true;
3003         info->dirty = g_malloc0(sizeof(*info->dirty));
3004         info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
3005         info->dirty->granularity =
3006             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
3007     }
3008
3009     if (bs->drv) {
3010         info->has_inserted = true;
3011         info->inserted = g_malloc0(sizeof(*info->inserted));
3012         info->inserted->file = g_strdup(bs->filename);
3013         info->inserted->ro = bs->read_only;
3014         info->inserted->drv = g_strdup(bs->drv->format_name);
3015         info->inserted->encrypted = bs->encrypted;
3016         info->inserted->encryption_key_missing = bdrv_key_required(bs);
3017
3018         if (bs->backing_file[0]) {
3019             info->inserted->has_backing_file = true;
3020             info->inserted->backing_file = g_strdup(bs->backing_file);
3021         }
3022
3023         info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);
3024
3025         if (bs->io_limits_enabled) {
3026             info->inserted->bps =
3027                 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3028             info->inserted->bps_rd =
3029                 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
3030             info->inserted->bps_wr =
3031                 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
3032             info->inserted->iops =
3033                 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3034             info->inserted->iops_rd =
3035                 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
3036             info->inserted->iops_wr =
3037                 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
3038         }
3039     }
3040     return info;
3041 }
3042
3043 BlockInfoList *qmp_query_block(Error **errp)
3044 {
3045     BlockInfoList *head = NULL, **p_next = &head;
3046     BlockDriverState *bs;
3047
3048     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3049         BlockInfoList *info = g_malloc0(sizeof(*info));
3050         info->value = bdrv_query_info(bs);
3051
3052         *p_next = info;
3053         p_next = &info->next;
3054     }
3055
3056     return head;
3057 }
3058
3059 BlockStats *bdrv_query_stats(const BlockDriverState *bs)
3060 {
3061     BlockStats *s;
3062
3063     s = g_malloc0(sizeof(*s));
3064
3065     if (bs->device_name[0]) {
3066         s->has_device = true;
3067         s->device = g_strdup(bs->device_name);
3068     }
3069
3070     s->stats = g_malloc0(sizeof(*s->stats));
3071     s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
3072     s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
3073     s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
3074     s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
3075     s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
3076
s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH]; 3077 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE]; 3078 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ]; 3079 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH]; 3080 3081 if (bs->file) { 3082 s->has_parent = true; 3083 s->parent = bdrv_query_stats(bs->file); 3084 } 3085 3086 return s; 3087 } 3088 3089 BlockStatsList *qmp_query_blockstats(Error **errp) 3090 { 3091 BlockStatsList *head = NULL, **p_next = &head; 3092 BlockDriverState *bs; 3093 3094 QTAILQ_FOREACH(bs, &bdrv_states, list) { 3095 BlockStatsList *info = g_malloc0(sizeof(*info)); 3096 info->value = bdrv_query_stats(bs); 3097 3098 *p_next = info; 3099 p_next = &info->next; 3100 } 3101 3102 return head; 3103 } 3104 3105 const char *bdrv_get_encrypted_filename(BlockDriverState *bs) 3106 { 3107 if (bs->backing_hd && bs->backing_hd->encrypted) 3108 return bs->backing_file; 3109 else if (bs->encrypted) 3110 return bs->filename; 3111 else 3112 return NULL; 3113 } 3114 3115 void bdrv_get_backing_filename(BlockDriverState *bs, 3116 char *filename, int filename_size) 3117 { 3118 pstrcpy(filename, filename_size, bs->backing_file); 3119 } 3120 3121 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 3122 const uint8_t *buf, int nb_sectors) 3123 { 3124 BlockDriver *drv = bs->drv; 3125 if (!drv) 3126 return -ENOMEDIUM; 3127 if (!drv->bdrv_write_compressed) 3128 return -ENOTSUP; 3129 if (bdrv_check_request(bs, sector_num, nb_sectors)) 3130 return -EIO; 3131 3132 assert(!bs->dirty_bitmap); 3133 3134 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 3135 } 3136 3137 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 3138 { 3139 BlockDriver *drv = bs->drv; 3140 if (!drv) 3141 return -ENOMEDIUM; 3142 if (!drv->bdrv_get_info) 3143 return -ENOTSUP; 3144 memset(bdi, 0, sizeof(*bdi)); 3145 return drv->bdrv_get_info(bs, bdi); 3146 } 3147 3148 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 3149 int64_t pos, int size) 3150 { 3151 BlockDriver *drv = bs->drv; 3152 if (!drv) 3153 return -ENOMEDIUM; 3154 if (drv->bdrv_save_vmstate) 3155 return drv->bdrv_save_vmstate(bs, buf, pos, size); 3156 if (bs->file) 3157 return bdrv_save_vmstate(bs->file, buf, pos, size); 3158 return -ENOTSUP; 3159 } 3160 3161 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 3162 int64_t pos, int size) 3163 { 3164 BlockDriver *drv = bs->drv; 3165 if (!drv) 3166 return -ENOMEDIUM; 3167 if (drv->bdrv_load_vmstate) 3168 return drv->bdrv_load_vmstate(bs, buf, pos, size); 3169 if (bs->file) 3170 return bdrv_load_vmstate(bs->file, buf, pos, size); 3171 return -ENOTSUP; 3172 } 3173 3174 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) 3175 { 3176 BlockDriver *drv = bs->drv; 3177 3178 if (!drv || !drv->bdrv_debug_event) { 3179 return; 3180 } 3181 3182 drv->bdrv_debug_event(bs, event); 3183 } 3184 3185 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, 3186 const char *tag) 3187 { 3188 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { 3189 bs = bs->file; 3190 } 3191 3192 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { 3193 return bs->drv->bdrv_debug_breakpoint(bs, event, tag); 3194 } 3195 3196 return -ENOTSUP; 3197 } 3198 3199 int bdrv_debug_resume(BlockDriverState *bs, const char *tag) 3200 { 3201 while (bs && bs->drv && !bs->drv->bdrv_debug_resume) { 3202 bs = bs->file; 3203 } 3204 3205 if (bs && bs->drv && bs->drv->bdrv_debug_resume) { 3206 return 
bs->drv->bdrv_debug_resume(bs, tag); 3207 } 3208 3209 return -ENOTSUP; 3210 } 3211 3212 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) 3213 { 3214 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { 3215 bs = bs->file; 3216 } 3217 3218 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { 3219 return bs->drv->bdrv_debug_is_suspended(bs, tag); 3220 } 3221 3222 return false; 3223 } 3224 3225 /**************************************************************/ 3226 /* handling of snapshots */ 3227 3228 int bdrv_can_snapshot(BlockDriverState *bs) 3229 { 3230 BlockDriver *drv = bs->drv; 3231 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { 3232 return 0; 3233 } 3234 3235 if (!drv->bdrv_snapshot_create) { 3236 if (bs->file != NULL) { 3237 return bdrv_can_snapshot(bs->file); 3238 } 3239 return 0; 3240 } 3241 3242 return 1; 3243 } 3244 3245 int bdrv_is_snapshot(BlockDriverState *bs) 3246 { 3247 return !!(bs->open_flags & BDRV_O_SNAPSHOT); 3248 } 3249 3250 BlockDriverState *bdrv_snapshots(void) 3251 { 3252 BlockDriverState *bs; 3253 3254 if (bs_snapshots) { 3255 return bs_snapshots; 3256 } 3257 3258 bs = NULL; 3259 while ((bs = bdrv_next(bs))) { 3260 if (bdrv_can_snapshot(bs)) { 3261 bs_snapshots = bs; 3262 return bs; 3263 } 3264 } 3265 return NULL; 3266 } 3267 3268 int bdrv_snapshot_create(BlockDriverState *bs, 3269 QEMUSnapshotInfo *sn_info) 3270 { 3271 BlockDriver *drv = bs->drv; 3272 if (!drv) 3273 return -ENOMEDIUM; 3274 if (drv->bdrv_snapshot_create) 3275 return drv->bdrv_snapshot_create(bs, sn_info); 3276 if (bs->file) 3277 return bdrv_snapshot_create(bs->file, sn_info); 3278 return -ENOTSUP; 3279 } 3280 3281 int bdrv_snapshot_goto(BlockDriverState *bs, 3282 const char *snapshot_id) 3283 { 3284 BlockDriver *drv = bs->drv; 3285 int ret, open_ret; 3286 3287 if (!drv) 3288 return -ENOMEDIUM; 3289 if (drv->bdrv_snapshot_goto) 3290 return drv->bdrv_snapshot_goto(bs, snapshot_id); 3291 3292 if (bs->file) { 3293 drv->bdrv_close(bs); 3294 ret = bdrv_snapshot_goto(bs->file, snapshot_id); 3295 open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); 3296 if (open_ret < 0) { 3297 bdrv_delete(bs->file); 3298 bs->drv = NULL; 3299 return open_ret; 3300 } 3301 return ret; 3302 } 3303 3304 return -ENOTSUP; 3305 } 3306 3307 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) 3308 { 3309 BlockDriver *drv = bs->drv; 3310 if (!drv) 3311 return -ENOMEDIUM; 3312 if (drv->bdrv_snapshot_delete) 3313 return drv->bdrv_snapshot_delete(bs, snapshot_id); 3314 if (bs->file) 3315 return bdrv_snapshot_delete(bs->file, snapshot_id); 3316 return -ENOTSUP; 3317 } 3318 3319 int bdrv_snapshot_list(BlockDriverState *bs, 3320 QEMUSnapshotInfo **psn_info) 3321 { 3322 BlockDriver *drv = bs->drv; 3323 if (!drv) 3324 return -ENOMEDIUM; 3325 if (drv->bdrv_snapshot_list) 3326 return drv->bdrv_snapshot_list(bs, psn_info); 3327 if (bs->file) 3328 return bdrv_snapshot_list(bs->file, psn_info); 3329 return -ENOTSUP; 3330 } 3331 3332 int bdrv_snapshot_load_tmp(BlockDriverState *bs, 3333 const char *snapshot_name) 3334 { 3335 BlockDriver *drv = bs->drv; 3336 if (!drv) { 3337 return -ENOMEDIUM; 3338 } 3339 if (!bs->read_only) { 3340 return -EINVAL; 3341 } 3342 if (drv->bdrv_snapshot_load_tmp) { 3343 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); 3344 } 3345 return -ENOTSUP; 3346 } 3347 3348 /* backing_file can either be relative, or absolute, or a protocol. If it is 3349 * relative, it must be relative to the chain. 
So, passing in bs->filename 3350 * from a BDS as backing_file should not be done, as that may be relative to 3351 * the CWD rather than the chain. */ 3352 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, 3353 const char *backing_file) 3354 { 3355 char *filename_full = NULL; 3356 char *backing_file_full = NULL; 3357 char *filename_tmp = NULL; 3358 int is_protocol = 0; 3359 BlockDriverState *curr_bs = NULL; 3360 BlockDriverState *retval = NULL; 3361 3362 if (!bs || !bs->drv || !backing_file) { 3363 return NULL; 3364 } 3365 3366 filename_full = g_malloc(PATH_MAX); 3367 backing_file_full = g_malloc(PATH_MAX); 3368 filename_tmp = g_malloc(PATH_MAX); 3369 3370 is_protocol = path_has_protocol(backing_file); 3371 3372 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { 3373 3374 /* If either of the filename paths is actually a protocol, then 3375 * compare unmodified paths; otherwise make paths relative */ 3376 if (is_protocol || path_has_protocol(curr_bs->backing_file)) { 3377 if (strcmp(backing_file, curr_bs->backing_file) == 0) { 3378 retval = curr_bs->backing_hd; 3379 break; 3380 } 3381 } else { 3382 /* If not an absolute filename path, make it relative to the current 3383 * image's filename path */ 3384 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 3385 backing_file); 3386 3387 /* We are going to compare absolute pathnames */ 3388 if (!realpath(filename_tmp, filename_full)) { 3389 continue; 3390 } 3391 3392 /* We need to make sure the backing filename we are comparing against 3393 * is relative to the current image filename (or absolute) */ 3394 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 3395 curr_bs->backing_file); 3396 3397 if (!realpath(filename_tmp, backing_file_full)) { 3398 continue; 3399 } 3400 3401 if (strcmp(backing_file_full, filename_full) == 0) { 3402 retval = curr_bs->backing_hd; 3403 break; 3404 } 3405 } 3406 } 3407 3408 g_free(filename_full); 3409 g_free(backing_file_full); 3410 g_free(filename_tmp); 3411 return retval; 3412 } 3413 3414 int bdrv_get_backing_file_depth(BlockDriverState *bs) 3415 { 3416 if (!bs->drv) { 3417 return 0; 3418 } 3419 3420 if (!bs->backing_hd) { 3421 return 0; 3422 } 3423 3424 return 1 + bdrv_get_backing_file_depth(bs->backing_hd); 3425 } 3426 3427 BlockDriverState *bdrv_find_base(BlockDriverState *bs) 3428 { 3429 BlockDriverState *curr_bs = NULL; 3430 3431 if (!bs) { 3432 return NULL; 3433 } 3434 3435 curr_bs = bs; 3436 3437 while (curr_bs->backing_hd) { 3438 curr_bs = curr_bs->backing_hd; 3439 } 3440 return curr_bs; 3441 } 3442 3443 #define NB_SUFFIXES 4 3444 3445 char *get_human_readable_size(char *buf, int buf_size, int64_t size) 3446 { 3447 static const char suffixes[NB_SUFFIXES] = "KMGT"; 3448 int64_t base; 3449 int i; 3450 3451 if (size <= 999) { 3452 snprintf(buf, buf_size, "%" PRId64, size); 3453 } else { 3454 base = 1024; 3455 for(i = 0; i < NB_SUFFIXES; i++) { 3456 if (size < (10 * base)) { 3457 snprintf(buf, buf_size, "%0.1f%c", 3458 (double)size / base, 3459 suffixes[i]); 3460 break; 3461 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) { 3462 snprintf(buf, buf_size, "%" PRId64 "%c", 3463 ((size + (base >> 1)) / base), 3464 suffixes[i]); 3465 break; 3466 } 3467 base = base * 1024; 3468 } 3469 } 3470 return buf; 3471 } 3472 3473 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) 3474 { 3475 char buf1[128], date_buf[128], clock_buf[128]; 3476 struct tm tm; 3477 time_t ti; 3478 int64_t secs; 3479 3480 if (!sn) { 3481 snprintf(buf, buf_size, 3482 
"%-10s%-20s%7s%20s%15s", 3483 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK"); 3484 } else { 3485 ti = sn->date_sec; 3486 localtime_r(&ti, &tm); 3487 strftime(date_buf, sizeof(date_buf), 3488 "%Y-%m-%d %H:%M:%S", &tm); 3489 secs = sn->vm_clock_nsec / 1000000000; 3490 snprintf(clock_buf, sizeof(clock_buf), 3491 "%02d:%02d:%02d.%03d", 3492 (int)(secs / 3600), 3493 (int)((secs / 60) % 60), 3494 (int)(secs % 60), 3495 (int)((sn->vm_clock_nsec / 1000000) % 1000)); 3496 snprintf(buf, buf_size, 3497 "%-10s%-20s%7s%20s%15s", 3498 sn->id_str, sn->name, 3499 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size), 3500 date_buf, 3501 clock_buf); 3502 } 3503 return buf; 3504 } 3505 3506 /**************************************************************/ 3507 /* async I/Os */ 3508 3509 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 3510 QEMUIOVector *qiov, int nb_sectors, 3511 BlockDriverCompletionFunc *cb, void *opaque) 3512 { 3513 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 3514 3515 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 3516 cb, opaque, false); 3517 } 3518 3519 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 3520 QEMUIOVector *qiov, int nb_sectors, 3521 BlockDriverCompletionFunc *cb, void *opaque) 3522 { 3523 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 3524 3525 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 3526 cb, opaque, true); 3527 } 3528 3529 3530 typedef struct MultiwriteCB { 3531 int error; 3532 int num_requests; 3533 int num_callbacks; 3534 struct { 3535 BlockDriverCompletionFunc *cb; 3536 void *opaque; 3537 QEMUIOVector *free_qiov; 3538 } callbacks[]; 3539 } MultiwriteCB; 3540 3541 static void multiwrite_user_cb(MultiwriteCB *mcb) 3542 { 3543 int i; 3544 3545 for (i = 0; i < mcb->num_callbacks; i++) { 3546 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 3547 if (mcb->callbacks[i].free_qiov) { 3548 qemu_iovec_destroy(mcb->callbacks[i].free_qiov); 3549 } 3550 g_free(mcb->callbacks[i].free_qiov); 3551 } 3552 } 3553 3554 static void multiwrite_cb(void *opaque, int ret) 3555 { 3556 MultiwriteCB *mcb = opaque; 3557 3558 trace_multiwrite_cb(mcb, ret); 3559 3560 if (ret < 0 && !mcb->error) { 3561 mcb->error = ret; 3562 } 3563 3564 mcb->num_requests--; 3565 if (mcb->num_requests == 0) { 3566 multiwrite_user_cb(mcb); 3567 g_free(mcb); 3568 } 3569 } 3570 3571 static int multiwrite_req_compare(const void *a, const void *b) 3572 { 3573 const BlockRequest *req1 = a, *req2 = b; 3574 3575 /* 3576 * Note that we can't simply subtract req2->sector from req1->sector 3577 * here as that could overflow the return value. 3578 */ 3579 if (req1->sector > req2->sector) { 3580 return 1; 3581 } else if (req1->sector < req2->sector) { 3582 return -1; 3583 } else { 3584 return 0; 3585 } 3586 } 3587 3588 /* 3589 * Takes a bunch of requests and tries to merge them. Returns the number of 3590 * requests that remain after merging. 3591 */ 3592 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, 3593 int num_reqs, MultiwriteCB *mcb) 3594 { 3595 int i, outidx; 3596 3597 // Sort requests by start sector 3598 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); 3599 3600 // Check if adjacent requests touch the same clusters. If so, combine them, 3601 // filling up gaps with zero sectors. 
3602     outidx = 0;
3603     for (i = 1; i < num_reqs; i++) {
3604         int merge = 0;
3605         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3606
3607         // Handle exactly sequential writes and overlapping writes.
3608         if (reqs[i].sector <= oldreq_last) {
3609             merge = 1;
3610         }
3611
3612         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3613             merge = 0;
3614         }
3615
3616         if (merge) {
3617             size_t size;
3618             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3619             qemu_iovec_init(qiov,
3620                             reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3621
3622             // Add the first request to the merged one. If the requests are
3623             // overlapping, drop the last sectors of the first request.
3624             size = (reqs[i].sector - reqs[outidx].sector) << 9;
3625             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3626
3627             // We shouldn't need to add any zeros between the two requests
3628             assert (reqs[i].sector <= oldreq_last);
3629
3630             // Add the second request
3631             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3632
3633             reqs[outidx].nb_sectors = qiov->size >> 9;
3634             reqs[outidx].qiov = qiov;
3635
3636             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3637         } else {
3638             outidx++;
3639             reqs[outidx].sector = reqs[i].sector;
3640             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3641             reqs[outidx].qiov = reqs[i].qiov;
3642         }
3643     }
3644
3645     return outidx + 1;
3646 }
3647
3648 /*
3649  * Submit multiple AIO write requests at once.
3650  *
3651  * On success, the function returns 0 and all requests in the reqs array have
3652  * been submitted. On error, this function returns -1, and any of the
3653  * requests may or may not have been submitted yet. In particular, the
3654  * callback will be called for some of the requests and not for others. The
3655  * caller must check the error field of each BlockRequest to find out which
3656  * callbacks to wait for (if error != 0, no callback will be called).
3657  *
3658  * The implementation may modify the contents of the reqs array, e.g. to merge
3659  * requests. However, the fields opaque and error are left unmodified as they
3660  * are used to signal failure for a single request to the caller.
3661  */
3662 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3663 {
3664     MultiwriteCB *mcb;
3665     int i;
3666
3667     /* don't submit writes if we don't have a medium */
3668     if (bs->drv == NULL) {
3669         for (i = 0; i < num_reqs; i++) {
3670             reqs[i].error = -ENOMEDIUM;
3671         }
3672         return -1;
3673     }
3674
3675     if (num_reqs == 0) {
3676         return 0;
3677     }
3678
3679     // Create MultiwriteCB structure
3680     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3681     mcb->num_requests = 0;
3682     mcb->num_callbacks = num_reqs;
3683
3684     for (i = 0; i < num_reqs; i++) {
3685         mcb->callbacks[i].cb = reqs[i].cb;
3686         mcb->callbacks[i].opaque = reqs[i].opaque;
3687     }
3688
3689     // Check for mergable requests
3690     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3691
3692     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3693
3694     /* Run the aio requests. */
3695     mcb->num_requests = num_reqs;
3696     for (i = 0; i < num_reqs; i++) {
3697         bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3698                         reqs[i].nb_sectors, multiwrite_cb, mcb);
3699     }
3700
3701     return 0;
3702 }
3703
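/*
 * Illustrative sketch (added commentary, not original code): submitting two
 * writes as one batch. 'qiov_a', 'qiov_b', 'write_done', 'ctx_a' and 'ctx_b'
 * are hypothetical placeholders:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov_a,
 *           .cb = write_done, .opaque = ctx_a },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov_b,
 *           .cb = write_done, .opaque = ctx_b },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // check reqs[i].error: a request with error != 0 was never
 *         // submitted and will get no callback
 *     }
 *
 * Because the two requests are exactly sequential, multiwrite_merge() above
 * would combine them into a single 16-sector write before submission.
 */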
3704 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3705 {
3706     acb->aiocb_info->cancel(acb);
3707 }
3708
3709 /* block I/O throttling */
3710 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3711     bool is_write, double elapsed_time, uint64_t *wait)
3712 {
3713     uint64_t bps_limit = 0;
3714     double bytes_limit, bytes_base, bytes_res;
3715     double slice_time, wait_time;
3716
3717     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3718         bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3719     } else if (bs->io_limits.bps[is_write]) {
3720         bps_limit = bs->io_limits.bps[is_write];
3721     } else {
3722         if (wait) {
3723             *wait = 0;
3724         }
3725
3726         return false;
3727     }
3728
3729     slice_time = bs->slice_end - bs->slice_start;
3730     slice_time /= (NANOSECONDS_PER_SECOND);
3731     bytes_limit = bps_limit * slice_time;
3732     bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3733     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3734         bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3735     }
3736
3737     /* bytes_base: the bytes of data which have been read/written; it is
3738      * obtained from the history statistics.
3739      * bytes_res: the remaining bytes of data which need to be read/written.
3740      * (bytes_base + bytes_res) / bps_limit: used to calculate
3741      * the total time for completing reading/writing all data.
3742      */
3743     bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3744
3745     if (bytes_base + bytes_res <= bytes_limit) {
3746         if (wait) {
3747             *wait = 0;
3748         }
3749
3750         return false;
3751     }
3752
3753     /* Calc approx time to dispatch */
3754     wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3755
3756     /* When the actual I/O rate exceeds the limits, bs->slice_end needs to
3757      * be extended so that the current statistics are kept until the timer
3758      * fires; the amount of the extension below has been tuned based on
3759      * experimental results.
3760 */ 3761 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10; 3762 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME; 3763 if (wait) { 3764 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; 3765 } 3766 3767 return true; 3768 } 3769 3770 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, 3771 double elapsed_time, uint64_t *wait) 3772 { 3773 uint64_t iops_limit = 0; 3774 double ios_limit, ios_base; 3775 double slice_time, wait_time; 3776 3777 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { 3778 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; 3779 } else if (bs->io_limits.iops[is_write]) { 3780 iops_limit = bs->io_limits.iops[is_write]; 3781 } else { 3782 if (wait) { 3783 *wait = 0; 3784 } 3785 3786 return false; 3787 } 3788 3789 slice_time = bs->slice_end - bs->slice_start; 3790 slice_time /= (NANOSECONDS_PER_SECOND); 3791 ios_limit = iops_limit * slice_time; 3792 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write]; 3793 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { 3794 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write]; 3795 } 3796 3797 if (ios_base + 1 <= ios_limit) { 3798 if (wait) { 3799 *wait = 0; 3800 } 3801 3802 return false; 3803 } 3804 3805 /* Calc approx time to dispatch */ 3806 wait_time = (ios_base + 1) / iops_limit; 3807 if (wait_time > elapsed_time) { 3808 wait_time = wait_time - elapsed_time; 3809 } else { 3810 wait_time = 0; 3811 } 3812 3813 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10; 3814 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME; 3815 if (wait) { 3816 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; 3817 } 3818 3819 return true; 3820 } 3821 3822 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, 3823 bool is_write, int64_t *wait) 3824 { 3825 int64_t now, max_wait; 3826 uint64_t bps_wait = 0, iops_wait = 0; 3827 double elapsed_time; 3828 int bps_ret, iops_ret; 3829 3830 now = qemu_get_clock_ns(vm_clock); 3831 if ((bs->slice_start < now) 3832 && (bs->slice_end > now)) { 3833 bs->slice_end = now + bs->slice_time; 3834 } else { 3835 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME; 3836 bs->slice_start = now; 3837 bs->slice_end = now + bs->slice_time; 3838 3839 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write]; 3840 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write]; 3841 3842 bs->io_base.ios[is_write] = bs->nr_ops[is_write]; 3843 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write]; 3844 } 3845 3846 elapsed_time = now - bs->slice_start; 3847 elapsed_time /= (NANOSECONDS_PER_SECOND); 3848 3849 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, 3850 is_write, elapsed_time, &bps_wait); 3851 iops_ret = bdrv_exceed_iops_limits(bs, is_write, 3852 elapsed_time, &iops_wait); 3853 if (bps_ret || iops_ret) { 3854 max_wait = bps_wait > iops_wait ? 
bps_wait : iops_wait; 3855 if (wait) { 3856 *wait = max_wait; 3857 } 3858 3859 now = qemu_get_clock_ns(vm_clock); 3860 if (bs->slice_end < now + max_wait) { 3861 bs->slice_end = now + max_wait; 3862 } 3863 3864 return true; 3865 } 3866 3867 if (wait) { 3868 *wait = 0; 3869 } 3870 3871 return false; 3872 } 3873 3874 /**************************************************************/ 3875 /* async block device emulation */ 3876 3877 typedef struct BlockDriverAIOCBSync { 3878 BlockDriverAIOCB common; 3879 QEMUBH *bh; 3880 int ret; 3881 /* vector translation state */ 3882 QEMUIOVector *qiov; 3883 uint8_t *bounce; 3884 int is_write; 3885 } BlockDriverAIOCBSync; 3886 3887 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) 3888 { 3889 BlockDriverAIOCBSync *acb = 3890 container_of(blockacb, BlockDriverAIOCBSync, common); 3891 qemu_bh_delete(acb->bh); 3892 acb->bh = NULL; 3893 qemu_aio_release(acb); 3894 } 3895 3896 static const AIOCBInfo bdrv_em_aiocb_info = { 3897 .aiocb_size = sizeof(BlockDriverAIOCBSync), 3898 .cancel = bdrv_aio_cancel_em, 3899 }; 3900 3901 static void bdrv_aio_bh_cb(void *opaque) 3902 { 3903 BlockDriverAIOCBSync *acb = opaque; 3904 3905 if (!acb->is_write) 3906 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 3907 qemu_vfree(acb->bounce); 3908 acb->common.cb(acb->common.opaque, acb->ret); 3909 qemu_bh_delete(acb->bh); 3910 acb->bh = NULL; 3911 qemu_aio_release(acb); 3912 } 3913 3914 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 3915 int64_t sector_num, 3916 QEMUIOVector *qiov, 3917 int nb_sectors, 3918 BlockDriverCompletionFunc *cb, 3919 void *opaque, 3920 int is_write) 3921 3922 { 3923 BlockDriverAIOCBSync *acb; 3924 3925 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 3926 acb->is_write = is_write; 3927 acb->qiov = qiov; 3928 acb->bounce = qemu_blockalign(bs, qiov->size); 3929 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); 3930 3931 if (is_write) { 3932 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 3933 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 3934 } else { 3935 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 3936 } 3937 3938 qemu_bh_schedule(acb->bh); 3939 3940 return &acb->common; 3941 } 3942 3943 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 3944 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 3945 BlockDriverCompletionFunc *cb, void *opaque) 3946 { 3947 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 3948 } 3949 3950 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 3951 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 3952 BlockDriverCompletionFunc *cb, void *opaque) 3953 { 3954 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 3955 } 3956 3957 3958 typedef struct BlockDriverAIOCBCoroutine { 3959 BlockDriverAIOCB common; 3960 BlockRequest req; 3961 bool is_write; 3962 bool *done; 3963 QEMUBH* bh; 3964 } BlockDriverAIOCBCoroutine; 3965 3966 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb) 3967 { 3968 BlockDriverAIOCBCoroutine *acb = 3969 container_of(blockacb, BlockDriverAIOCBCoroutine, common); 3970 bool done = false; 3971 3972 acb->done = &done; 3973 while (!done) { 3974 qemu_aio_wait(); 3975 } 3976 } 3977 3978 static const AIOCBInfo bdrv_em_co_aiocb_info = { 3979 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine), 3980 .cancel = bdrv_aio_co_cancel_em, 3981 }; 3982 3983 static void bdrv_co_em_bh(void *opaque) 3984 { 3985 
BlockDriverAIOCBCoroutine *acb = opaque; 3986 3987 acb->common.cb(acb->common.opaque, acb->req.error); 3988 3989 if (acb->done) { 3990 *acb->done = true; 3991 } 3992 3993 qemu_bh_delete(acb->bh); 3994 qemu_aio_release(acb); 3995 } 3996 3997 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 3998 static void coroutine_fn bdrv_co_do_rw(void *opaque) 3999 { 4000 BlockDriverAIOCBCoroutine *acb = opaque; 4001 BlockDriverState *bs = acb->common.bs; 4002 4003 if (!acb->is_write) { 4004 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 4005 acb->req.nb_sectors, acb->req.qiov, 0); 4006 } else { 4007 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 4008 acb->req.nb_sectors, acb->req.qiov, 0); 4009 } 4010 4011 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 4012 qemu_bh_schedule(acb->bh); 4013 } 4014 4015 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 4016 int64_t sector_num, 4017 QEMUIOVector *qiov, 4018 int nb_sectors, 4019 BlockDriverCompletionFunc *cb, 4020 void *opaque, 4021 bool is_write) 4022 { 4023 Coroutine *co; 4024 BlockDriverAIOCBCoroutine *acb; 4025 4026 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4027 acb->req.sector = sector_num; 4028 acb->req.nb_sectors = nb_sectors; 4029 acb->req.qiov = qiov; 4030 acb->is_write = is_write; 4031 acb->done = NULL; 4032 4033 co = qemu_coroutine_create(bdrv_co_do_rw); 4034 qemu_coroutine_enter(co, acb); 4035 4036 return &acb->common; 4037 } 4038 4039 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 4040 { 4041 BlockDriverAIOCBCoroutine *acb = opaque; 4042 BlockDriverState *bs = acb->common.bs; 4043 4044 acb->req.error = bdrv_co_flush(bs); 4045 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 4046 qemu_bh_schedule(acb->bh); 4047 } 4048 4049 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, 4050 BlockDriverCompletionFunc *cb, void *opaque) 4051 { 4052 trace_bdrv_aio_flush(bs, opaque); 4053 4054 Coroutine *co; 4055 BlockDriverAIOCBCoroutine *acb; 4056 4057 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4058 acb->done = NULL; 4059 4060 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 4061 qemu_coroutine_enter(co, acb); 4062 4063 return &acb->common; 4064 } 4065 4066 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 4067 { 4068 BlockDriverAIOCBCoroutine *acb = opaque; 4069 BlockDriverState *bs = acb->common.bs; 4070 4071 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 4072 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 4073 qemu_bh_schedule(acb->bh); 4074 } 4075 4076 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, 4077 int64_t sector_num, int nb_sectors, 4078 BlockDriverCompletionFunc *cb, void *opaque) 4079 { 4080 Coroutine *co; 4081 BlockDriverAIOCBCoroutine *acb; 4082 4083 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 4084 4085 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 4086 acb->req.sector = sector_num; 4087 acb->req.nb_sectors = nb_sectors; 4088 acb->done = NULL; 4089 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 4090 qemu_coroutine_enter(co, acb); 4091 4092 return &acb->common; 4093 } 4094 4095 void bdrv_init(void) 4096 { 4097 module_call_init(MODULE_INIT_BLOCK); 4098 } 4099 4100 void bdrv_init_with_whitelist(void) 4101 { 4102 use_bdrv_whitelist = 1; 4103 bdrv_init(); 4104 } 4105 4106 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 4107 BlockDriverCompletionFunc *cb, void *opaque) 4108 { 4109 BlockDriverAIOCB *acb; 4110 4111 acb = g_slice_alloc(aiocb_info->aiocb_size); 4112 
acb->aiocb_info = aiocb_info;
4113     acb->bs = bs;
4114     acb->cb = cb;
4115     acb->opaque = opaque;
4116     return acb;
4117 }
4118
4119 void qemu_aio_release(void *p)
4120 {
4121     BlockDriverAIOCB *acb = p;
4122     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4123 }
4124
4125 /**************************************************************/
4126 /* Coroutine block device emulation */
4127
4128 typedef struct CoroutineIOCompletion {
4129     Coroutine *coroutine;
4130     int ret;
4131 } CoroutineIOCompletion;
4132
4133 static void bdrv_co_io_em_complete(void *opaque, int ret)
4134 {
4135     CoroutineIOCompletion *co = opaque;
4136
4137     co->ret = ret;
4138     qemu_coroutine_enter(co->coroutine, NULL);
4139 }
4140
4141 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4142                                       int nb_sectors, QEMUIOVector *iov,
4143                                       bool is_write)
4144 {
4145     CoroutineIOCompletion co = {
4146         .coroutine = qemu_coroutine_self(),
4147     };
4148     BlockDriverAIOCB *acb;
4149
4150     if (is_write) {
4151         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4152                                        bdrv_co_io_em_complete, &co);
4153     } else {
4154         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4155                                       bdrv_co_io_em_complete, &co);
4156     }
4157
4158     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4159     if (!acb) {
4160         return -EIO;
4161     }
4162     qemu_coroutine_yield();
4163
4164     return co.ret;
4165 }
4166
4167 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4168                                          int64_t sector_num, int nb_sectors,
4169                                          QEMUIOVector *iov)
4170 {
4171     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4172 }
4173
4174 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4175                                           int64_t sector_num, int nb_sectors,
4176                                           QEMUIOVector *iov)
4177 {
4178     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4179 }
4180
4181 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4182 {
4183     RwCo *rwco = opaque;
4184
4185     rwco->ret = bdrv_co_flush(rwco->bs);
4186 }
4187
4188 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4189 {
4190     int ret;
4191
4192     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4193         return 0;
4194     }
4195
4196     /* Write back cached data to the OS even with cache=unsafe */
4197     if (bs->drv->bdrv_co_flush_to_os) {
4198         ret = bs->drv->bdrv_co_flush_to_os(bs);
4199         if (ret < 0) {
4200             return ret;
4201         }
4202     }
4203
4204     /* But don't actually force it to the disk with cache=unsafe */
4205     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4206         goto flush_parent;
4207     }
4208
4209     if (bs->drv->bdrv_co_flush_to_disk) {
4210         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4211     } else if (bs->drv->bdrv_aio_flush) {
4212         BlockDriverAIOCB *acb;
4213         CoroutineIOCompletion co = {
4214             .coroutine = qemu_coroutine_self(),
4215         };
4216
4217         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4218         if (acb == NULL) {
4219             ret = -EIO;
4220         } else {
4221             qemu_coroutine_yield();
4222             ret = co.ret;
4223         }
4224     } else {
4225         /*
4226          * Some block drivers always operate in either writethrough or unsafe
4227          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4228          * know how the server works (because the behaviour is hardcoded or
4229          * depends on server-side configuration), so we can't ensure that
4230          * everything is safe on disk. Returning an error doesn't work because
4231          * that would break guests even if the server operates in writethrough
4232          * mode.
4233          *
4234          * Let's hope the user knows what he's doing.
4235          */
4236         ret = 0;
4237     }
4238     if (ret < 0) {
4239         return ret;
4240     }
4241
4242     /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4243      * in the case of cache=unsafe, so there are no useless flushes.
4244      */
4245 flush_parent:
4246     return bdrv_co_flush(bs->file);
4247 }
4248
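/*
 * Illustrative sketch (added commentary, not original code): the
 * CoroutineIOCompletion pattern used by bdrv_co_io_em() and bdrv_co_flush()
 * above. A coroutine bridges to a callback-based AIO interface by yielding
 * until the completion callback re-enters it. 'some_aio_call' stands for any
 * driver callback taking a completion function and an opaque pointer:
 *
 *     CoroutineIOCompletion co = {
 *         .coroutine = qemu_coroutine_self(),
 *     };
 *     BlockDriverAIOCB *acb;
 *
 *     acb = some_aio_call(bs, ..., bdrv_co_io_em_complete, &co);
 *     if (!acb) {
 *         return -EIO;            // submission failed, no completion coming
 *     }
 *     qemu_coroutine_yield();     // resumed by bdrv_co_io_em_complete()
 *     return co.ret;              // result stored by the callback
 */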
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
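/*
 * A minimal sketch of the flush cascade above, as seen by a caller: one
 * bdrv_flush() on a format driver (e.g. qcow2 on top of a raw file)
 * results in, at most,
 *
 *     bs->drv->bdrv_co_flush_to_os(bs);      (format data -> host cache)
 *     bs->drv->bdrv_co_flush_to_disk(bs);    (host cache -> disk; skipped
 *                                             with BDRV_O_NO_FLUSH)
 *     bdrv_co_flush(bs->file);               (recurse into the protocol)
 *
 * so callers only ever need:
 *
 *     int ret = bdrv_flush(bs);
 *     if (ret < 0) {
 *         (report or handle the failed flush)
 *     }
 */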
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    if (bs->dirty_bitmap) {
        bdrv_reset_dirty(bs, sector_num, nb_sectors);
    }

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
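/*
 * For illustration: discarding a 1 MiB region starting at a byte offset
 * (the offset and length are assumptions of this sketch; both must be
 * converted to 512-byte sectors). Note that success may mean the request
 * was silently ignored, e.g. when BDRV_O_UNMAP is not set:
 *
 *     int64_t sector = offset >> BDRV_SECTOR_BITS;
 *     int nb_sectors = (1 * 1024 * 1024) >> BDRV_SECTOR_BITS;
 *
 *     int ret = bdrv_discard(bs, sector, nb_sectors);
 *     if (ret < 0) {
 *         (-EROFS, -EIO, ... depending on the failure)
 *     }
 */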
/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;

    assert((granularity & (granularity - 1)) == 0);

    if (granularity) {
        granularity >>= BDRV_SECTOR_BITS;
        assert(!bs->dirty_bitmap);
        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    } else {
        if (bs->dirty_bitmap) {
            hbitmap_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    if (bs->dirty_bitmap) {
        return hbitmap_get(bs->dirty_bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    if (bs->dirty_bitmap) {
        return hbitmap_count(bs->dirty_bitmap);
    } else {
        return 0;
    }
}
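/*
 * A minimal sketch of consuming the dirty bitmap, e.g. from a block job.
 * The granularity value and the chunk_sectors variable are assumptions of
 * this sketch, not fixed by this file:
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_set_dirty_tracking(bs, 65536);   (64 KiB granularity, power of 2)
 *     ...
 *     bdrv_dirty_iter_init(bs, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         (copy out the dirty chunk, then clear its bits)
 *         bdrv_reset_dirty(bs, sector, chunk_sectors);
 *     }
 *     bdrv_set_dirty_tracking(bs, 0);       (frees the bitmap)
 */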
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
                enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
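/*
 * For illustration, how a device model brackets a request with the
 * accounting helpers above (the 4 KiB request size is an assumption of
 * this sketch):
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, 4096, BDRV_ACCT_READ);
 *     (... submit and complete the 4 KiB read ...)
 *     bdrv_acct_done(bs, &cookie);   (updates byte, op and latency stats)
 */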
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files are always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
                            backing_drv);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s'",
                                 backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_setg(errp, "Formatting or formatting option not supported "
                             "for file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_setg(errp, "The image size is too large for file format '%s'",
                       fmt);
        } else {
            error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
                       strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }
}

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}
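/*
 * For illustration, how a caller such as qemu-img might drive
 * bdrv_img_create() above. The filename, format, size and flags value are
 * assumptions of this sketch:
 *
 *     Error *local_err = NULL;
 *
 *     bdrv_img_create("disk.qcow2", "qcow2",
 *                     NULL, NULL,           (no backing file/format)
 *                     NULL,                 (no extra -o options)
 *                     8 * 1024 * 1024,      (8 MiB virtual size)
 *                     BDRV_O_RDWR, &local_err, false);
 *     if (error_is_set(&local_err)) {
 *         (report local_err, then free it)
 *     }
 */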