1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "trace.h" 27 #include "sysemu/block-backend.h" 28 #include "block/blockjob.h" 29 #include "block/block_int.h" 30 #include "qemu/cutils.h" 31 #include "qapi/error.h" 32 #include "qemu/error-report.h" 33 34 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 35 36 static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, 37 int64_t offset, 38 QEMUIOVector *qiov, 39 BdrvRequestFlags flags, 40 BlockCompletionFunc *cb, 41 void *opaque, 42 bool is_write); 43 static void coroutine_fn bdrv_co_do_rw(void *opaque); 44 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 45 int64_t offset, int count, BdrvRequestFlags flags); 46 47 void bdrv_parent_drained_begin(BlockDriverState *bs) 48 { 49 BdrvChild *c; 50 51 QLIST_FOREACH(c, &bs->parents, next_parent) { 52 if (c->role->drained_begin) { 53 c->role->drained_begin(c); 54 } 55 } 56 } 57 58 void bdrv_parent_drained_end(BlockDriverState *bs) 59 { 60 BdrvChild *c; 61 62 QLIST_FOREACH(c, &bs->parents, next_parent) { 63 if (c->role->drained_end) { 64 c->role->drained_end(c); 65 } 66 } 67 } 68 69 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) 70 { 71 dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); 72 dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); 73 dst->opt_mem_alignment = MAX(dst->opt_mem_alignment, 74 src->opt_mem_alignment); 75 dst->min_mem_alignment = MAX(dst->min_mem_alignment, 76 src->min_mem_alignment); 77 dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov); 78 } 79 80 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) 81 { 82 BlockDriver *drv = bs->drv; 83 Error *local_err = NULL; 84 85 memset(&bs->bl, 0, sizeof(bs->bl)); 86 87 if (!drv) { 88 return; 89 } 90 91 /* Default alignment based on whether driver has byte interface */ 92 bs->bl.request_alignment = drv->bdrv_co_preadv ? 
1 : 512; 93 94 /* Take some limits from the children as a default */ 95 if (bs->file) { 96 bdrv_refresh_limits(bs->file->bs, &local_err); 97 if (local_err) { 98 error_propagate(errp, local_err); 99 return; 100 } 101 bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); 102 } else { 103 bs->bl.min_mem_alignment = 512; 104 bs->bl.opt_mem_alignment = getpagesize(); 105 106 /* Safe default since most protocols use readv()/writev()/etc */ 107 bs->bl.max_iov = IOV_MAX; 108 } 109 110 if (bs->backing) { 111 bdrv_refresh_limits(bs->backing->bs, &local_err); 112 if (local_err) { 113 error_propagate(errp, local_err); 114 return; 115 } 116 bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); 117 } 118 119 /* Then let the driver override it */ 120 if (drv->bdrv_refresh_limits) { 121 drv->bdrv_refresh_limits(bs, errp); 122 } 123 } 124 125 /** 126 * The copy-on-read flag is actually a reference count so multiple users may 127 * use the feature without worrying about clobbering its previous state. 128 * Copy-on-read stays enabled until all users have called to disable it. 129 */ 130 void bdrv_enable_copy_on_read(BlockDriverState *bs) 131 { 132 bs->copy_on_read++; 133 } 134 135 void bdrv_disable_copy_on_read(BlockDriverState *bs) 136 { 137 assert(bs->copy_on_read > 0); 138 bs->copy_on_read--; 139 } 140 141 /* Check if any requests are in-flight (including throttled requests) */ 142 bool bdrv_requests_pending(BlockDriverState *bs) 143 { 144 BdrvChild *child; 145 146 if (atomic_read(&bs->in_flight)) { 147 return true; 148 } 149 150 QLIST_FOREACH(child, &bs->children, next) { 151 if (bdrv_requests_pending(child->bs)) { 152 return true; 153 } 154 } 155 156 return false; 157 } 158 159 static bool bdrv_drain_recurse(BlockDriverState *bs) 160 { 161 BdrvChild *child, *tmp; 162 bool waited; 163 164 waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); 165 166 if (bs->drv && bs->drv->bdrv_drain) { 167 bs->drv->bdrv_drain(bs); 168 } 169 170 QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { 171 BlockDriverState *bs = child->bs; 172 bool in_main_loop = 173 qemu_get_current_aio_context() == qemu_get_aio_context(); 174 assert(bs->refcnt > 0); 175 if (in_main_loop) { 176 /* In case the recursive bdrv_drain_recurse processes a 177 * block_job_defer_to_main_loop BH and modifies the graph, 178 * let's hold a reference to bs until we are done. 179 * 180 * IOThread doesn't have such a BH, and it is not safe to call 181 * bdrv_unref without BQL, so skip doing it there. 182 */ 183 bdrv_ref(bs); 184 } 185 waited |= bdrv_drain_recurse(bs); 186 if (in_main_loop) { 187 bdrv_unref(bs); 188 } 189 } 190 191 return waited; 192 } 193 194 typedef struct { 195 Coroutine *co; 196 BlockDriverState *bs; 197 bool done; 198 } BdrvCoDrainData; 199 200 static void bdrv_co_drain_bh_cb(void *opaque) 201 { 202 BdrvCoDrainData *data = opaque; 203 Coroutine *co = data->co; 204 BlockDriverState *bs = data->bs; 205 206 bdrv_dec_in_flight(bs); 207 bdrv_drained_begin(bs); 208 data->done = true; 209 aio_co_wake(co); 210 } 211 212 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) 213 { 214 BdrvCoDrainData data; 215 216 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and 217 * other coroutines run if they were queued from 218 * qemu_co_queue_run_restart(). 
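 * The BH is scheduled in this BlockDriverState's AioContext; bdrv_inc_in_flight()
 * keeps bs->in_flight elevated until bdrv_co_drain_bh_cb() runs, and the BH then
 * drains bs and wakes this coroutine again via aio_co_wake().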
*/ 219 220 assert(qemu_in_coroutine()); 221 data = (BdrvCoDrainData) { 222 .co = qemu_coroutine_self(), 223 .bs = bs, 224 .done = false, 225 }; 226 bdrv_inc_in_flight(bs); 227 aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), 228 bdrv_co_drain_bh_cb, &data); 229 230 qemu_coroutine_yield(); 231 /* If we are resumed from some other event (such as an aio completion or a 232 * timer callback), it is a bug in the caller that should be fixed. */ 233 assert(data.done); 234 } 235 236 void bdrv_drained_begin(BlockDriverState *bs) 237 { 238 if (qemu_in_coroutine()) { 239 bdrv_co_yield_to_drain(bs); 240 return; 241 } 242 243 if (!bs->quiesce_counter++) { 244 aio_disable_external(bdrv_get_aio_context(bs)); 245 bdrv_parent_drained_begin(bs); 246 } 247 248 bdrv_drain_recurse(bs); 249 } 250 251 void bdrv_drained_end(BlockDriverState *bs) 252 { 253 assert(bs->quiesce_counter > 0); 254 if (--bs->quiesce_counter > 0) { 255 return; 256 } 257 258 bdrv_parent_drained_end(bs); 259 aio_enable_external(bdrv_get_aio_context(bs)); 260 } 261 262 /* 263 * Wait for pending requests to complete on a single BlockDriverState subtree, 264 * and suspend block driver's internal I/O until next request arrives. 265 * 266 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 267 * AioContext. 268 * 269 * Only this BlockDriverState's AioContext is run, so in-flight requests must 270 * not depend on events in other AioContexts. In that case, use 271 * bdrv_drain_all() instead. 272 */ 273 void coroutine_fn bdrv_co_drain(BlockDriverState *bs) 274 { 275 assert(qemu_in_coroutine()); 276 bdrv_drained_begin(bs); 277 bdrv_drained_end(bs); 278 } 279 280 void bdrv_drain(BlockDriverState *bs) 281 { 282 bdrv_drained_begin(bs); 283 bdrv_drained_end(bs); 284 } 285 286 /* 287 * Wait for pending requests to complete across all BlockDriverStates 288 * 289 * This function does not flush data to disk, use bdrv_flush_all() for that 290 * after calling this function. 291 * 292 * This pauses all block jobs and disables external clients. It must 293 * be paired with bdrv_drain_all_end(). 294 * 295 * NOTE: no new block jobs or BlockDriverStates can be created between 296 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls. 297 */ 298 void bdrv_drain_all_begin(void) 299 { 300 /* Always run first iteration so any pending completion BHs run */ 301 bool waited = true; 302 BlockDriverState *bs; 303 BdrvNextIterator it; 304 BlockJob *job = NULL; 305 GSList *aio_ctxs = NULL, *ctx; 306 307 while ((job = block_job_next(job))) { 308 AioContext *aio_context = blk_get_aio_context(job->blk); 309 310 aio_context_acquire(aio_context); 311 block_job_pause(job); 312 aio_context_release(aio_context); 313 } 314 315 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 316 AioContext *aio_context = bdrv_get_aio_context(bs); 317 318 aio_context_acquire(aio_context); 319 bdrv_parent_drained_begin(bs); 320 aio_disable_external(aio_context); 321 aio_context_release(aio_context); 322 323 if (!g_slist_find(aio_ctxs, aio_context)) { 324 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 325 } 326 } 327 328 /* Note that completion of an asynchronous I/O operation can trigger any 329 * number of other I/O operations on other devices---for example a 330 * coroutine can submit an I/O request to another device in response to 331 * request completion. Therefore we must keep looping until there was no 332 * more activity rather than simply draining each device independently. 
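 * The loop below therefore keeps calling bdrv_drain_recurse() on every BDS
 * in every AioContext until a full pass completes without having had to
 * wait anywhere (waited remains false).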
333 */ 334 while (waited) { 335 waited = false; 336 337 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 338 AioContext *aio_context = ctx->data; 339 340 aio_context_acquire(aio_context); 341 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 342 if (aio_context == bdrv_get_aio_context(bs)) { 343 waited |= bdrv_drain_recurse(bs); 344 } 345 } 346 aio_context_release(aio_context); 347 } 348 } 349 350 g_slist_free(aio_ctxs); 351 } 352 353 void bdrv_drain_all_end(void) 354 { 355 BlockDriverState *bs; 356 BdrvNextIterator it; 357 BlockJob *job = NULL; 358 359 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 360 AioContext *aio_context = bdrv_get_aio_context(bs); 361 362 aio_context_acquire(aio_context); 363 aio_enable_external(aio_context); 364 bdrv_parent_drained_end(bs); 365 aio_context_release(aio_context); 366 } 367 368 while ((job = block_job_next(job))) { 369 AioContext *aio_context = blk_get_aio_context(job->blk); 370 371 aio_context_acquire(aio_context); 372 block_job_resume(job); 373 aio_context_release(aio_context); 374 } 375 } 376 377 void bdrv_drain_all(void) 378 { 379 bdrv_drain_all_begin(); 380 bdrv_drain_all_end(); 381 } 382 383 /** 384 * Remove an active request from the tracked requests list 385 * 386 * This function should be called when a tracked request is completing. 387 */ 388 static void tracked_request_end(BdrvTrackedRequest *req) 389 { 390 if (req->serialising) { 391 req->bs->serialising_in_flight--; 392 } 393 394 QLIST_REMOVE(req, list); 395 qemu_co_queue_restart_all(&req->wait_queue); 396 } 397 398 /** 399 * Add an active request to the tracked requests list 400 */ 401 static void tracked_request_begin(BdrvTrackedRequest *req, 402 BlockDriverState *bs, 403 int64_t offset, 404 unsigned int bytes, 405 enum BdrvTrackedRequestType type) 406 { 407 *req = (BdrvTrackedRequest){ 408 .bs = bs, 409 .offset = offset, 410 .bytes = bytes, 411 .type = type, 412 .co = qemu_coroutine_self(), 413 .serialising = false, 414 .overlap_offset = offset, 415 .overlap_bytes = bytes, 416 }; 417 418 qemu_co_queue_init(&req->wait_queue); 419 420 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 421 } 422 423 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 424 { 425 int64_t overlap_offset = req->offset & ~(align - 1); 426 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 427 - overlap_offset; 428 429 if (!req->serialising) { 430 req->bs->serialising_in_flight++; 431 req->serialising = true; 432 } 433 434 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 435 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 436 } 437 438 /** 439 * Round a region to cluster boundaries (sector-based) 440 */ 441 void bdrv_round_sectors_to_clusters(BlockDriverState *bs, 442 int64_t sector_num, int nb_sectors, 443 int64_t *cluster_sector_num, 444 int *cluster_nb_sectors) 445 { 446 BlockDriverInfo bdi; 447 448 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 449 *cluster_sector_num = sector_num; 450 *cluster_nb_sectors = nb_sectors; 451 } else { 452 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 453 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 454 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 455 nb_sectors, c); 456 } 457 } 458 459 /** 460 * Round a region to cluster boundaries 461 */ 462 void bdrv_round_to_clusters(BlockDriverState *bs, 463 int64_t offset, unsigned int bytes, 464 int64_t *cluster_offset, 465 unsigned int *cluster_bytes) 466 { 467 BlockDriverInfo bdi; 468 469 if 
(bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 470 *cluster_offset = offset; 471 *cluster_bytes = bytes; 472 } else { 473 int64_t c = bdi.cluster_size; 474 *cluster_offset = QEMU_ALIGN_DOWN(offset, c); 475 *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); 476 } 477 } 478 479 static int bdrv_get_cluster_size(BlockDriverState *bs) 480 { 481 BlockDriverInfo bdi; 482 int ret; 483 484 ret = bdrv_get_info(bs, &bdi); 485 if (ret < 0 || bdi.cluster_size == 0) { 486 return bs->bl.request_alignment; 487 } else { 488 return bdi.cluster_size; 489 } 490 } 491 492 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 493 int64_t offset, unsigned int bytes) 494 { 495 /* aaaa bbbb */ 496 if (offset >= req->overlap_offset + req->overlap_bytes) { 497 return false; 498 } 499 /* bbbb aaaa */ 500 if (req->overlap_offset >= offset + bytes) { 501 return false; 502 } 503 return true; 504 } 505 506 void bdrv_inc_in_flight(BlockDriverState *bs) 507 { 508 atomic_inc(&bs->in_flight); 509 } 510 511 static void dummy_bh_cb(void *opaque) 512 { 513 } 514 515 void bdrv_wakeup(BlockDriverState *bs) 516 { 517 if (bs->wakeup) { 518 aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); 519 } 520 } 521 522 void bdrv_dec_in_flight(BlockDriverState *bs) 523 { 524 atomic_dec(&bs->in_flight); 525 bdrv_wakeup(bs); 526 } 527 528 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 529 { 530 BlockDriverState *bs = self->bs; 531 BdrvTrackedRequest *req; 532 bool retry; 533 bool waited = false; 534 535 if (!bs->serialising_in_flight) { 536 return false; 537 } 538 539 do { 540 retry = false; 541 QLIST_FOREACH(req, &bs->tracked_requests, list) { 542 if (req == self || (!req->serialising && !self->serialising)) { 543 continue; 544 } 545 if (tracked_request_overlaps(req, self->overlap_offset, 546 self->overlap_bytes)) 547 { 548 /* Hitting this means there was a reentrant request, for 549 * example, a block driver issuing nested requests. This must 550 * never happen since it means deadlock. 551 */ 552 assert(qemu_coroutine_self() != req->co); 553 554 /* If the request is already (indirectly) waiting for us, or 555 * will wait for us as soon as it wakes up, then just go on 556 * (instead of producing a deadlock in the former case). 
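 * (If req->waiting_for is set, req is itself blocked on another tracked
 * request, possibly indirectly on us, so queueing behind it here could
 * close a wait cycle.)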
*/ 557 if (!req->waiting_for) { 558 self->waiting_for = req; 559 qemu_co_queue_wait(&req->wait_queue, NULL); 560 self->waiting_for = NULL; 561 retry = true; 562 waited = true; 563 break; 564 } 565 } 566 } 567 } while (retry); 568 569 return waited; 570 } 571 572 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 573 size_t size) 574 { 575 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 576 return -EIO; 577 } 578 579 if (!bdrv_is_inserted(bs)) { 580 return -ENOMEDIUM; 581 } 582 583 if (offset < 0) { 584 return -EIO; 585 } 586 587 return 0; 588 } 589 590 typedef struct RwCo { 591 BdrvChild *child; 592 int64_t offset; 593 QEMUIOVector *qiov; 594 bool is_write; 595 int ret; 596 BdrvRequestFlags flags; 597 } RwCo; 598 599 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 600 { 601 RwCo *rwco = opaque; 602 603 if (!rwco->is_write) { 604 rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset, 605 rwco->qiov->size, rwco->qiov, 606 rwco->flags); 607 } else { 608 rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset, 609 rwco->qiov->size, rwco->qiov, 610 rwco->flags); 611 } 612 } 613 614 /* 615 * Process a vectored synchronous request using coroutines 616 */ 617 static int bdrv_prwv_co(BdrvChild *child, int64_t offset, 618 QEMUIOVector *qiov, bool is_write, 619 BdrvRequestFlags flags) 620 { 621 Coroutine *co; 622 RwCo rwco = { 623 .child = child, 624 .offset = offset, 625 .qiov = qiov, 626 .is_write = is_write, 627 .ret = NOT_DONE, 628 .flags = flags, 629 }; 630 631 if (qemu_in_coroutine()) { 632 /* Fast-path if already in coroutine context */ 633 bdrv_rw_co_entry(&rwco); 634 } else { 635 co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); 636 bdrv_coroutine_enter(child->bs, co); 637 BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); 638 } 639 return rwco.ret; 640 } 641 642 /* 643 * Process a synchronous request using coroutines 644 */ 645 static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf, 646 int nb_sectors, bool is_write, BdrvRequestFlags flags) 647 { 648 QEMUIOVector qiov; 649 struct iovec iov = { 650 .iov_base = (void *)buf, 651 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 652 }; 653 654 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 655 return -EINVAL; 656 } 657 658 qemu_iovec_init_external(&qiov, &iov, 1); 659 return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS, 660 &qiov, is_write, flags); 661 } 662 663 /* return < 0 if error. See bdrv_write() for the return codes */ 664 int bdrv_read(BdrvChild *child, int64_t sector_num, 665 uint8_t *buf, int nb_sectors) 666 { 667 return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0); 668 } 669 670 /* Return < 0 if error. Important errors are: 671 -EIO generic I/O error (may happen for all errors) 672 -ENOMEDIUM No media inserted. 673 -EINVAL Invalid sector number or nb_sectors 674 -EACCES Trying to write a read-only device 675 */ 676 int bdrv_write(BdrvChild *child, int64_t sector_num, 677 const uint8_t *buf, int nb_sectors) 678 { 679 return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 680 } 681 682 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, 683 int count, BdrvRequestFlags flags) 684 { 685 QEMUIOVector qiov; 686 struct iovec iov = { 687 .iov_base = NULL, 688 .iov_len = count, 689 }; 690 691 qemu_iovec_init_external(&qiov, &iov, 1); 692 return bdrv_prwv_co(child, offset, &qiov, true, 693 BDRV_REQ_ZERO_WRITE | flags); 694 } 695 696 /* 697 * Completely zero out a block device with the help of bdrv_pwrite_zeroes. 
698 * The operation is sped up by checking the block status and only writing 699 * zeroes to the device if they currently do not return zeroes. Optional 700 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, 701 * BDRV_REQ_FUA). 702 * 703 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 704 */ 705 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) 706 { 707 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 708 BlockDriverState *bs = child->bs; 709 BlockDriverState *file; 710 int n; 711 712 target_sectors = bdrv_nb_sectors(bs); 713 if (target_sectors < 0) { 714 return target_sectors; 715 } 716 717 for (;;) { 718 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 719 if (nb_sectors <= 0) { 720 return 0; 721 } 722 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); 723 if (ret < 0) { 724 error_report("error getting block status at sector %" PRId64 ": %s", 725 sector_num, strerror(-ret)); 726 return ret; 727 } 728 if (ret & BDRV_BLOCK_ZERO) { 729 sector_num += n; 730 continue; 731 } 732 ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS, 733 n << BDRV_SECTOR_BITS, flags); 734 if (ret < 0) { 735 error_report("error writing zeroes at sector %" PRId64 ": %s", 736 sector_num, strerror(-ret)); 737 return ret; 738 } 739 sector_num += n; 740 } 741 } 742 743 int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) 744 { 745 int ret; 746 747 ret = bdrv_prwv_co(child, offset, qiov, false, 0); 748 if (ret < 0) { 749 return ret; 750 } 751 752 return qiov->size; 753 } 754 755 int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) 756 { 757 QEMUIOVector qiov; 758 struct iovec iov = { 759 .iov_base = (void *)buf, 760 .iov_len = bytes, 761 }; 762 763 if (bytes < 0) { 764 return -EINVAL; 765 } 766 767 qemu_iovec_init_external(&qiov, &iov, 1); 768 return bdrv_preadv(child, offset, &qiov); 769 } 770 771 int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) 772 { 773 int ret; 774 775 ret = bdrv_prwv_co(child, offset, qiov, true, 0); 776 if (ret < 0) { 777 return ret; 778 } 779 780 return qiov->size; 781 } 782 783 int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) 784 { 785 QEMUIOVector qiov; 786 struct iovec iov = { 787 .iov_base = (void *) buf, 788 .iov_len = bytes, 789 }; 790 791 if (bytes < 0) { 792 return -EINVAL; 793 } 794 795 qemu_iovec_init_external(&qiov, &iov, 1); 796 return bdrv_pwritev(child, offset, &qiov); 797 } 798 799 /* 800 * Writes to the file and ensures that no writes are reordered across this 801 * request (acts as a barrier) 802 * 803 * Returns 0 on success, -errno in error cases. 
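 *
 * Illustrative usage sketch (not part of the original code; assumes a valid
 * BdrvChild *child and a header structure the caller wants to persist):
 *
 *     ret = bdrv_pwrite_sync(child, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;
 *     }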
804 */ 805 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, 806 const void *buf, int count) 807 { 808 int ret; 809 810 ret = bdrv_pwrite(child, offset, buf, count); 811 if (ret < 0) { 812 return ret; 813 } 814 815 ret = bdrv_flush(child->bs); 816 if (ret < 0) { 817 return ret; 818 } 819 820 return 0; 821 } 822 823 typedef struct CoroutineIOCompletion { 824 Coroutine *coroutine; 825 int ret; 826 } CoroutineIOCompletion; 827 828 static void bdrv_co_io_em_complete(void *opaque, int ret) 829 { 830 CoroutineIOCompletion *co = opaque; 831 832 co->ret = ret; 833 aio_co_wake(co->coroutine); 834 } 835 836 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, 837 uint64_t offset, uint64_t bytes, 838 QEMUIOVector *qiov, int flags) 839 { 840 BlockDriver *drv = bs->drv; 841 int64_t sector_num; 842 unsigned int nb_sectors; 843 844 assert(!(flags & ~BDRV_REQ_MASK)); 845 846 if (drv->bdrv_co_preadv) { 847 return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); 848 } 849 850 sector_num = offset >> BDRV_SECTOR_BITS; 851 nb_sectors = bytes >> BDRV_SECTOR_BITS; 852 853 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 854 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 855 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); 856 857 if (drv->bdrv_co_readv) { 858 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 859 } else { 860 BlockAIOCB *acb; 861 CoroutineIOCompletion co = { 862 .coroutine = qemu_coroutine_self(), 863 }; 864 865 acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, 866 bdrv_co_io_em_complete, &co); 867 if (acb == NULL) { 868 return -EIO; 869 } else { 870 qemu_coroutine_yield(); 871 return co.ret; 872 } 873 } 874 } 875 876 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, 877 uint64_t offset, uint64_t bytes, 878 QEMUIOVector *qiov, int flags) 879 { 880 BlockDriver *drv = bs->drv; 881 int64_t sector_num; 882 unsigned int nb_sectors; 883 int ret; 884 885 assert(!(flags & ~BDRV_REQ_MASK)); 886 887 if (drv->bdrv_co_pwritev) { 888 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, 889 flags & bs->supported_write_flags); 890 flags &= ~bs->supported_write_flags; 891 goto emulate_flags; 892 } 893 894 sector_num = offset >> BDRV_SECTOR_BITS; 895 nb_sectors = bytes >> BDRV_SECTOR_BITS; 896 897 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 898 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 899 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); 900 901 if (drv->bdrv_co_writev_flags) { 902 ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, 903 flags & bs->supported_write_flags); 904 flags &= ~bs->supported_write_flags; 905 } else if (drv->bdrv_co_writev) { 906 assert(!bs->supported_write_flags); 907 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 908 } else { 909 BlockAIOCB *acb; 910 CoroutineIOCompletion co = { 911 .coroutine = qemu_coroutine_self(), 912 }; 913 914 acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, 915 bdrv_co_io_em_complete, &co); 916 if (acb == NULL) { 917 ret = -EIO; 918 } else { 919 qemu_coroutine_yield(); 920 ret = co.ret; 921 } 922 } 923 924 emulate_flags: 925 if (ret == 0 && (flags & BDRV_REQ_FUA)) { 926 ret = bdrv_co_flush(bs); 927 } 928 929 return ret; 930 } 931 932 static int coroutine_fn 933 bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset, 934 uint64_t bytes, QEMUIOVector *qiov) 935 { 936 BlockDriver *drv = bs->drv; 937 938 if (!drv->bdrv_co_pwritev_compressed) { 939 return -ENOTSUP; 940 } 941 942 return 
drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov); 943 } 944 945 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 946 int64_t offset, unsigned int bytes, QEMUIOVector *qiov) 947 { 948 BlockDriverState *bs = child->bs; 949 950 /* Perform I/O through a temporary buffer so that users who scribble over 951 * their read buffer while the operation is in progress do not end up 952 * modifying the image file. This is critical for zero-copy guest I/O 953 * where anything might happen inside guest memory. 954 */ 955 void *bounce_buffer; 956 957 BlockDriver *drv = bs->drv; 958 struct iovec iov; 959 QEMUIOVector bounce_qiov; 960 int64_t cluster_offset; 961 unsigned int cluster_bytes; 962 size_t skip_bytes; 963 int ret; 964 965 /* FIXME We cannot require callers to have write permissions when all they 966 * are doing is a read request. If we did things right, write permissions 967 * would be obtained anyway, but internally by the copy-on-read code. As 968 * long as it is implemented here rather than in a separate filter driver, 969 * the copy-on-read code doesn't have its own BdrvChild, however, for which 970 * it could request permissions. Therefore we have to bypass the permission 971 * system for the moment. */ 972 // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 973 974 /* Cover the entire cluster so no additional backing file I/O is required when 975 * allocating a cluster in the image file. 976 */ 977 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 978 979 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 980 cluster_offset, cluster_bytes); 981 982 iov.iov_len = cluster_bytes; 983 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 984 if (bounce_buffer == NULL) { 985 ret = -ENOMEM; 986 goto err; 987 } 988 989 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 990 991 ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes, 992 &bounce_qiov, 0); 993 if (ret < 0) { 994 goto err; 995 } 996 997 if (drv->bdrv_co_pwrite_zeroes && 998 buffer_is_zero(bounce_buffer, iov.iov_len)) { 999 /* FIXME: Should we (perhaps conditionally) be setting 1000 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1001 * that still correctly reads as zero? */ 1002 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0); 1003 } else { 1004 /* This does not change the data on the disk, so it is not necessary 1005 * to flush even in cache=writethrough mode. 1006 */ 1007 ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes, 1008 &bounce_qiov, 0); 1009 } 1010 1011 if (ret < 0) { 1012 /* It might be okay to ignore write errors for guest requests. If this 1013 * is a deliberate copy-on-read then we don't want to ignore the error. 1014 * Simply report it in all cases. 1015 */ 1016 goto err; 1017 } 1018 1019 skip_bytes = offset - cluster_offset; 1020 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes); 1021 1022 err: 1023 qemu_vfree(bounce_buffer); 1024 return ret; 1025 } 1026 1027 /* 1028 * Forwards an already correctly aligned request to the BlockDriver. This 1029 * handles copy on read, zeroing after EOF, and fragmentation of large 1030 * reads; any other features must be implemented by the caller. 
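 *
 * Callers are expected to have set up a tracked request (req), incremented
 * bs->in_flight and aligned offset/bytes to 'align'; the asserts below
 * enforce the alignment part.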
1031 */ 1032 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 1033 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1034 int64_t align, QEMUIOVector *qiov, int flags) 1035 { 1036 BlockDriverState *bs = child->bs; 1037 int64_t total_bytes, max_bytes; 1038 int ret = 0; 1039 uint64_t bytes_remaining = bytes; 1040 int max_transfer; 1041 1042 assert(is_power_of_2(align)); 1043 assert((offset & (align - 1)) == 0); 1044 assert((bytes & (align - 1)) == 0); 1045 assert(!qiov || bytes == qiov->size); 1046 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1047 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1048 align); 1049 1050 /* TODO: We would need a per-BDS .supported_read_flags and 1051 * potential fallback support, if we ever implement any read flags 1052 * to pass through to drivers. For now, there aren't any 1053 * passthrough flags. */ 1054 assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); 1055 1056 /* Handle Copy on Read and associated serialisation */ 1057 if (flags & BDRV_REQ_COPY_ON_READ) { 1058 /* If we touch the same cluster it counts as an overlap. This 1059 * guarantees that allocating writes will be serialized and not race 1060 * with each other for the same cluster. For example, in copy-on-read 1061 * it ensures that the CoR read and write operations are atomic and 1062 * guest writes cannot interleave between them. */ 1063 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 1064 } 1065 1066 if (!(flags & BDRV_REQ_NO_SERIALISING)) { 1067 wait_serialising_requests(req); 1068 } 1069 1070 if (flags & BDRV_REQ_COPY_ON_READ) { 1071 int64_t start_sector = offset >> BDRV_SECTOR_BITS; 1072 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 1073 unsigned int nb_sectors = end_sector - start_sector; 1074 int pnum; 1075 1076 ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum); 1077 if (ret < 0) { 1078 goto out; 1079 } 1080 1081 if (!ret || pnum != nb_sectors) { 1082 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov); 1083 goto out; 1084 } 1085 } 1086 1087 /* Forward the request to the BlockDriver, possibly fragmenting it */ 1088 total_bytes = bdrv_getlength(bs); 1089 if (total_bytes < 0) { 1090 ret = total_bytes; 1091 goto out; 1092 } 1093 1094 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 1095 if (bytes <= max_bytes && bytes <= max_transfer) { 1096 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); 1097 goto out; 1098 } 1099 1100 while (bytes_remaining) { 1101 int num; 1102 1103 if (max_bytes) { 1104 QEMUIOVector local_qiov; 1105 1106 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 1107 assert(num); 1108 qemu_iovec_init(&local_qiov, qiov->niov); 1109 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 1110 1111 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1112 num, &local_qiov, 0); 1113 max_bytes -= num; 1114 qemu_iovec_destroy(&local_qiov); 1115 } else { 1116 num = bytes_remaining; 1117 ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, 1118 bytes_remaining); 1119 } 1120 if (ret < 0) { 1121 goto out; 1122 } 1123 bytes_remaining -= num; 1124 } 1125 1126 out: 1127 return ret < 0 ? 
ret : 0; 1128 } 1129 1130 /* 1131 * Handle a read request in coroutine context 1132 */ 1133 int coroutine_fn bdrv_co_preadv(BdrvChild *child, 1134 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1135 BdrvRequestFlags flags) 1136 { 1137 BlockDriverState *bs = child->bs; 1138 BlockDriver *drv = bs->drv; 1139 BdrvTrackedRequest req; 1140 1141 uint64_t align = bs->bl.request_alignment; 1142 uint8_t *head_buf = NULL; 1143 uint8_t *tail_buf = NULL; 1144 QEMUIOVector local_qiov; 1145 bool use_local_qiov = false; 1146 int ret; 1147 1148 if (!drv) { 1149 return -ENOMEDIUM; 1150 } 1151 1152 ret = bdrv_check_byte_request(bs, offset, bytes); 1153 if (ret < 0) { 1154 return ret; 1155 } 1156 1157 bdrv_inc_in_flight(bs); 1158 1159 /* Don't do copy-on-read if we read data before write operation */ 1160 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { 1161 flags |= BDRV_REQ_COPY_ON_READ; 1162 } 1163 1164 /* Align read if necessary by padding qiov */ 1165 if (offset & (align - 1)) { 1166 head_buf = qemu_blockalign(bs, align); 1167 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1168 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1169 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1170 use_local_qiov = true; 1171 1172 bytes += offset & (align - 1); 1173 offset = offset & ~(align - 1); 1174 } 1175 1176 if ((offset + bytes) & (align - 1)) { 1177 if (!use_local_qiov) { 1178 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1179 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1180 use_local_qiov = true; 1181 } 1182 tail_buf = qemu_blockalign(bs, align); 1183 qemu_iovec_add(&local_qiov, tail_buf, 1184 align - ((offset + bytes) & (align - 1))); 1185 1186 bytes = ROUND_UP(bytes, align); 1187 } 1188 1189 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 1190 ret = bdrv_aligned_preadv(child, &req, offset, bytes, align, 1191 use_local_qiov ? 
&local_qiov : qiov, 1192 flags); 1193 tracked_request_end(&req); 1194 bdrv_dec_in_flight(bs); 1195 1196 if (use_local_qiov) { 1197 qemu_iovec_destroy(&local_qiov); 1198 qemu_vfree(head_buf); 1199 qemu_vfree(tail_buf); 1200 } 1201 1202 return ret; 1203 } 1204 1205 static int coroutine_fn bdrv_co_do_readv(BdrvChild *child, 1206 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1207 BdrvRequestFlags flags) 1208 { 1209 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1210 return -EINVAL; 1211 } 1212 1213 return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS, 1214 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1215 } 1216 1217 int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num, 1218 int nb_sectors, QEMUIOVector *qiov) 1219 { 1220 trace_bdrv_co_readv(child->bs, sector_num, nb_sectors); 1221 1222 return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0); 1223 } 1224 1225 /* Maximum buffer for write zeroes fallback, in bytes */ 1226 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) 1227 1228 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1229 int64_t offset, int count, BdrvRequestFlags flags) 1230 { 1231 BlockDriver *drv = bs->drv; 1232 QEMUIOVector qiov; 1233 struct iovec iov = {0}; 1234 int ret = 0; 1235 bool need_flush = false; 1236 int head = 0; 1237 int tail = 0; 1238 1239 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); 1240 int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1241 bs->bl.request_alignment); 1242 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1243 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1244 1245 assert(alignment % bs->bl.request_alignment == 0); 1246 head = offset % alignment; 1247 tail = (offset + count) % alignment; 1248 max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1249 assert(max_write_zeroes >= bs->bl.request_alignment); 1250 1251 while (count > 0 && !ret) { 1252 int num = count; 1253 1254 /* Align request. Block drivers can expect the "bulk" of the request 1255 * to be aligned, and that unaligned requests do not cross cluster 1256 * boundaries. 1257 */ 1258 if (head) { 1259 /* Make a small request up to the first aligned sector. For 1260 * convenience, limit this request to max_transfer even if 1261 * we don't need to fall back to writes. */ 1262 num = MIN(MIN(count, max_transfer), alignment - head); 1263 head = (head + num) % alignment; 1264 assert(num < max_write_zeroes); 1265 } else if (tail && num > alignment) { 1266 /* Shorten the request to the last aligned sector. 
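 * The unaligned tail is then written by a later, smaller iteration of
 * this loop.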
*/ 1267 num -= tail; 1268 } 1269 1270 /* limit request size */ 1271 if (num > max_write_zeroes) { 1272 num = max_write_zeroes; 1273 } 1274 1275 ret = -ENOTSUP; 1276 /* First try the efficient write zeroes operation */ 1277 if (drv->bdrv_co_pwrite_zeroes) { 1278 ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, 1279 flags & bs->supported_zero_flags); 1280 if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && 1281 !(bs->supported_zero_flags & BDRV_REQ_FUA)) { 1282 need_flush = true; 1283 } 1284 } else { 1285 assert(!bs->supported_zero_flags); 1286 } 1287 1288 if (ret == -ENOTSUP) { 1289 /* Fall back to bounce buffer if write zeroes is unsupported */ 1290 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; 1291 1292 if ((flags & BDRV_REQ_FUA) && 1293 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1294 /* No need for bdrv_driver_pwritev() to do a fallback 1295 * flush on each chunk; use just one at the end */ 1296 write_flags &= ~BDRV_REQ_FUA; 1297 need_flush = true; 1298 } 1299 num = MIN(num, max_transfer); 1300 iov.iov_len = num; 1301 if (iov.iov_base == NULL) { 1302 iov.iov_base = qemu_try_blockalign(bs, num); 1303 if (iov.iov_base == NULL) { 1304 ret = -ENOMEM; 1305 goto fail; 1306 } 1307 memset(iov.iov_base, 0, num); 1308 } 1309 qemu_iovec_init_external(&qiov, &iov, 1); 1310 1311 ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags); 1312 1313 /* Keep the bounce buffer around if it is big enough for all 1314 * future requests. 1315 */ 1316 if (num < max_transfer) { 1317 qemu_vfree(iov.iov_base); 1318 iov.iov_base = NULL; 1319 } 1320 } 1321 1322 offset += num; 1323 count -= num; 1324 } 1325 1326 fail: 1327 if (ret == 0 && need_flush) { 1328 ret = bdrv_co_flush(bs); 1329 } 1330 qemu_vfree(iov.iov_base); 1331 return ret; 1332 } 1333 1334 /* 1335 * Forwards an already correctly aligned write request to the BlockDriver, 1336 * after possibly fragmenting it. 1337 */ 1338 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1339 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1340 int64_t align, QEMUIOVector *qiov, int flags) 1341 { 1342 BlockDriverState *bs = child->bs; 1343 BlockDriver *drv = bs->drv; 1344 bool waited; 1345 int ret; 1346 1347 int64_t start_sector = offset >> BDRV_SECTOR_BITS; 1348 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 1349 uint64_t bytes_remaining = bytes; 1350 int max_transfer; 1351 1352 assert(is_power_of_2(align)); 1353 assert((offset & (align - 1)) == 0); 1354 assert((bytes & (align - 1)) == 0); 1355 assert(!qiov || bytes == qiov->size); 1356 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1357 assert(!(flags & ~BDRV_REQ_MASK)); 1358 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1359 align); 1360 1361 waited = wait_serialising_requests(req); 1362 assert(!waited || !req->serialising); 1363 assert(req->overlap_offset <= offset); 1364 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1365 /* FIXME: Block migration uses the BlockBackend of the guest device at a 1366 * point when it has not yet taken write permissions. This will be 1367 * fixed by a future patch, but for now we have to bypass this 1368 * assertion for block migration to work. */ 1369 // assert(child->perm & BLK_PERM_WRITE); 1370 /* FIXME: Because of the above, we also cannot guarantee that all format 1371 * BDS take the BLK_PERM_RESIZE permission on their file BDS, since 1372 * they are not obligated to do so if they do not have any parent 1373 * that has taken the permission to write to them. 
*/ 1374 // assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); 1375 1376 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1377 1378 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1379 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 1380 qemu_iovec_is_zero(qiov)) { 1381 flags |= BDRV_REQ_ZERO_WRITE; 1382 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1383 flags |= BDRV_REQ_MAY_UNMAP; 1384 } 1385 } 1386 1387 if (ret < 0) { 1388 /* Do nothing, write notifier decided to fail this request */ 1389 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1390 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1391 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 1392 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 1393 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov); 1394 } else if (bytes <= max_transfer) { 1395 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1396 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); 1397 } else { 1398 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1399 while (bytes_remaining) { 1400 int num = MIN(bytes_remaining, max_transfer); 1401 QEMUIOVector local_qiov; 1402 int local_flags = flags; 1403 1404 assert(num); 1405 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 1406 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1407 /* If FUA is going to be emulated by flush, we only 1408 * need to flush on the last iteration */ 1409 local_flags &= ~BDRV_REQ_FUA; 1410 } 1411 qemu_iovec_init(&local_qiov, qiov->niov); 1412 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 1413 1414 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 1415 num, &local_qiov, local_flags); 1416 qemu_iovec_destroy(&local_qiov); 1417 if (ret < 0) { 1418 break; 1419 } 1420 bytes_remaining -= num; 1421 } 1422 } 1423 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 1424 1425 ++bs->write_gen; 1426 bdrv_set_dirty(bs, start_sector, end_sector - start_sector); 1427 1428 if (bs->wr_highest_offset < offset + bytes) { 1429 bs->wr_highest_offset = offset + bytes; 1430 } 1431 1432 if (ret >= 0) { 1433 bs->total_sectors = MAX(bs->total_sectors, end_sector); 1434 ret = 0; 1435 } 1436 1437 return ret; 1438 } 1439 1440 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 1441 int64_t offset, 1442 unsigned int bytes, 1443 BdrvRequestFlags flags, 1444 BdrvTrackedRequest *req) 1445 { 1446 BlockDriverState *bs = child->bs; 1447 uint8_t *buf = NULL; 1448 QEMUIOVector local_qiov; 1449 struct iovec iov; 1450 uint64_t align = bs->bl.request_alignment; 1451 unsigned int head_padding_bytes, tail_padding_bytes; 1452 int ret = 0; 1453 1454 head_padding_bytes = offset & (align - 1); 1455 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1456 1457 1458 assert(flags & BDRV_REQ_ZERO_WRITE); 1459 if (head_padding_bytes || tail_padding_bytes) { 1460 buf = qemu_blockalign(bs, align); 1461 iov = (struct iovec) { 1462 .iov_base = buf, 1463 .iov_len = align, 1464 }; 1465 qemu_iovec_init_external(&local_qiov, &iov, 1); 1466 } 1467 if (head_padding_bytes) { 1468 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1469 1470 /* RMW the unaligned part before head. 
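 * Read the whole aligned block into the bounce buffer, zero the part
 * covered by this request and write the block back with
 * BDRV_REQ_ZERO_WRITE cleared.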
*/ 1471 mark_request_serialising(req, align); 1472 wait_serialising_requests(req); 1473 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1474 ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align, 1475 align, &local_qiov, 0); 1476 if (ret < 0) { 1477 goto fail; 1478 } 1479 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1480 1481 memset(buf + head_padding_bytes, 0, zero_bytes); 1482 ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align, 1483 align, &local_qiov, 1484 flags & ~BDRV_REQ_ZERO_WRITE); 1485 if (ret < 0) { 1486 goto fail; 1487 } 1488 offset += zero_bytes; 1489 bytes -= zero_bytes; 1490 } 1491 1492 assert(!bytes || (offset & (align - 1)) == 0); 1493 if (bytes >= align) { 1494 /* Write the aligned part in the middle. */ 1495 uint64_t aligned_bytes = bytes & ~(align - 1); 1496 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 1497 NULL, flags); 1498 if (ret < 0) { 1499 goto fail; 1500 } 1501 bytes -= aligned_bytes; 1502 offset += aligned_bytes; 1503 } 1504 1505 assert(!bytes || (offset & (align - 1)) == 0); 1506 if (bytes) { 1507 assert(align == tail_padding_bytes + bytes); 1508 /* RMW the unaligned part after tail. */ 1509 mark_request_serialising(req, align); 1510 wait_serialising_requests(req); 1511 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1512 ret = bdrv_aligned_preadv(child, req, offset, align, 1513 align, &local_qiov, 0); 1514 if (ret < 0) { 1515 goto fail; 1516 } 1517 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1518 1519 memset(buf, 0, bytes); 1520 ret = bdrv_aligned_pwritev(child, req, offset, align, align, 1521 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1522 } 1523 fail: 1524 qemu_vfree(buf); 1525 return ret; 1526 1527 } 1528 1529 /* 1530 * Handle a write request in coroutine context 1531 */ 1532 int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 1533 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1534 BdrvRequestFlags flags) 1535 { 1536 BlockDriverState *bs = child->bs; 1537 BdrvTrackedRequest req; 1538 uint64_t align = bs->bl.request_alignment; 1539 uint8_t *head_buf = NULL; 1540 uint8_t *tail_buf = NULL; 1541 QEMUIOVector local_qiov; 1542 bool use_local_qiov = false; 1543 int ret; 1544 1545 if (!bs->drv) { 1546 return -ENOMEDIUM; 1547 } 1548 if (bs->read_only) { 1549 return -EPERM; 1550 } 1551 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1552 1553 ret = bdrv_check_byte_request(bs, offset, bytes); 1554 if (ret < 0) { 1555 return ret; 1556 } 1557 1558 bdrv_inc_in_flight(bs); 1559 /* 1560 * Align write if necessary by performing a read-modify-write cycle. 1561 * Pad qiov with the read parts and be sure to have a tracked request not 1562 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
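 *
 * For example, with a 512-byte request_alignment, a 1000-byte write at
 * offset 100 becomes a 1536-byte request at offset 0: a 100-byte head and
 * a 436-byte tail are read in and written back together with the caller's
 * data.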
1563 */ 1564 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1565 1566 if (!qiov) { 1567 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 1568 goto out; 1569 } 1570 1571 if (offset & (align - 1)) { 1572 QEMUIOVector head_qiov; 1573 struct iovec head_iov; 1574 1575 mark_request_serialising(&req, align); 1576 wait_serialising_requests(&req); 1577 1578 head_buf = qemu_blockalign(bs, align); 1579 head_iov = (struct iovec) { 1580 .iov_base = head_buf, 1581 .iov_len = align, 1582 }; 1583 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1584 1585 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1586 ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align, 1587 align, &head_qiov, 0); 1588 if (ret < 0) { 1589 goto fail; 1590 } 1591 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1592 1593 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1594 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1595 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1596 use_local_qiov = true; 1597 1598 bytes += offset & (align - 1); 1599 offset = offset & ~(align - 1); 1600 1601 /* We have read the tail already if the request is smaller 1602 * than one aligned block. 1603 */ 1604 if (bytes < align) { 1605 qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes); 1606 bytes = align; 1607 } 1608 } 1609 1610 if ((offset + bytes) & (align - 1)) { 1611 QEMUIOVector tail_qiov; 1612 struct iovec tail_iov; 1613 size_t tail_bytes; 1614 bool waited; 1615 1616 mark_request_serialising(&req, align); 1617 waited = wait_serialising_requests(&req); 1618 assert(!waited || !use_local_qiov); 1619 1620 tail_buf = qemu_blockalign(bs, align); 1621 tail_iov = (struct iovec) { 1622 .iov_base = tail_buf, 1623 .iov_len = align, 1624 }; 1625 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1626 1627 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1628 ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1), 1629 align, align, &tail_qiov, 0); 1630 if (ret < 0) { 1631 goto fail; 1632 } 1633 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1634 1635 if (!use_local_qiov) { 1636 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1637 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1638 use_local_qiov = true; 1639 } 1640 1641 tail_bytes = (offset + bytes) & (align - 1); 1642 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1643 1644 bytes = ROUND_UP(bytes, align); 1645 } 1646 1647 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 1648 use_local_qiov ? 
&local_qiov : qiov, 1649 flags); 1650 1651 fail: 1652 1653 if (use_local_qiov) { 1654 qemu_iovec_destroy(&local_qiov); 1655 } 1656 qemu_vfree(head_buf); 1657 qemu_vfree(tail_buf); 1658 out: 1659 tracked_request_end(&req); 1660 bdrv_dec_in_flight(bs); 1661 return ret; 1662 } 1663 1664 static int coroutine_fn bdrv_co_do_writev(BdrvChild *child, 1665 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1666 BdrvRequestFlags flags) 1667 { 1668 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1669 return -EINVAL; 1670 } 1671 1672 return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS, 1673 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1674 } 1675 1676 int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num, 1677 int nb_sectors, QEMUIOVector *qiov) 1678 { 1679 trace_bdrv_co_writev(child->bs, sector_num, nb_sectors); 1680 1681 return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0); 1682 } 1683 1684 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 1685 int count, BdrvRequestFlags flags) 1686 { 1687 trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags); 1688 1689 if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 1690 flags &= ~BDRV_REQ_MAY_UNMAP; 1691 } 1692 1693 return bdrv_co_pwritev(child, offset, count, NULL, 1694 BDRV_REQ_ZERO_WRITE | flags); 1695 } 1696 1697 /* 1698 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not. 1699 */ 1700 int bdrv_flush_all(void) 1701 { 1702 BdrvNextIterator it; 1703 BlockDriverState *bs = NULL; 1704 int result = 0; 1705 1706 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 1707 AioContext *aio_context = bdrv_get_aio_context(bs); 1708 int ret; 1709 1710 aio_context_acquire(aio_context); 1711 ret = bdrv_flush(bs); 1712 if (ret < 0 && !result) { 1713 result = ret; 1714 } 1715 aio_context_release(aio_context); 1716 } 1717 1718 return result; 1719 } 1720 1721 1722 typedef struct BdrvCoGetBlockStatusData { 1723 BlockDriverState *bs; 1724 BlockDriverState *base; 1725 BlockDriverState **file; 1726 int64_t sector_num; 1727 int nb_sectors; 1728 int *pnum; 1729 int64_t ret; 1730 bool done; 1731 } BdrvCoGetBlockStatusData; 1732 1733 /* 1734 * Returns the allocation status of the specified sectors. 1735 * Drivers not implementing the functionality are assumed to not support 1736 * backing files, hence all their sectors are reported as allocated. 1737 * 1738 * If 'sector_num' is beyond the end of the disk image the return value is 0 1739 * and 'pnum' is set to 0. 1740 * 1741 * 'pnum' is set to the number of sectors (including and immediately following 1742 * the specified sector) that are known to be in the same 1743 * allocated/unallocated state. 1744 * 1745 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1746 * beyond the end of the disk image it will be clamped. 1747 * 1748 * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' 1749 * points to the BDS which the sector range is allocated in. 
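 * In that case the offset of the data within 'file' is encoded in the
 * upper bits of the return value (see the ret >> BDRV_SECTOR_BITS uses
 * below).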
1750 */ 1751 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1752 int64_t sector_num, 1753 int nb_sectors, int *pnum, 1754 BlockDriverState **file) 1755 { 1756 int64_t total_sectors; 1757 int64_t n; 1758 int64_t ret, ret2; 1759 1760 total_sectors = bdrv_nb_sectors(bs); 1761 if (total_sectors < 0) { 1762 return total_sectors; 1763 } 1764 1765 if (sector_num >= total_sectors) { 1766 *pnum = 0; 1767 return 0; 1768 } 1769 1770 n = total_sectors - sector_num; 1771 if (n < nb_sectors) { 1772 nb_sectors = n; 1773 } 1774 1775 if (!bs->drv->bdrv_co_get_block_status) { 1776 *pnum = nb_sectors; 1777 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1778 if (bs->drv->protocol_name) { 1779 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1780 } 1781 return ret; 1782 } 1783 1784 *file = NULL; 1785 bdrv_inc_in_flight(bs); 1786 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, 1787 file); 1788 if (ret < 0) { 1789 *pnum = 0; 1790 goto out; 1791 } 1792 1793 if (ret & BDRV_BLOCK_RAW) { 1794 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1795 ret = bdrv_get_block_status(*file, ret >> BDRV_SECTOR_BITS, 1796 *pnum, pnum, file); 1797 goto out; 1798 } 1799 1800 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1801 ret |= BDRV_BLOCK_ALLOCATED; 1802 } else { 1803 if (bdrv_unallocated_blocks_are_zero(bs)) { 1804 ret |= BDRV_BLOCK_ZERO; 1805 } else if (bs->backing) { 1806 BlockDriverState *bs2 = bs->backing->bs; 1807 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1808 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1809 ret |= BDRV_BLOCK_ZERO; 1810 } 1811 } 1812 } 1813 1814 if (*file && *file != bs && 1815 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1816 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1817 BlockDriverState *file2; 1818 int file_pnum; 1819 1820 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, 1821 *pnum, &file_pnum, &file2); 1822 if (ret2 >= 0) { 1823 /* Ignore errors. This is just providing extra information, it 1824 * is useful but not necessary. 1825 */ 1826 if (!file_pnum) { 1827 /* !file_pnum indicates an offset at or beyond the EOF; it is 1828 * perfectly valid for the format block driver to point to such 1829 * offsets, so catch it and mark everything as zero */ 1830 ret |= BDRV_BLOCK_ZERO; 1831 } else { 1832 /* Limit request to the range reported by the protocol driver */ 1833 *pnum = file_pnum; 1834 ret |= (ret2 & BDRV_BLOCK_ZERO); 1835 } 1836 } 1837 } 1838 1839 out: 1840 bdrv_dec_in_flight(bs); 1841 return ret; 1842 } 1843 1844 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1845 BlockDriverState *base, 1846 int64_t sector_num, 1847 int nb_sectors, 1848 int *pnum, 1849 BlockDriverState **file) 1850 { 1851 BlockDriverState *p; 1852 int64_t ret = 0; 1853 1854 assert(bs != base); 1855 for (p = bs; p != base; p = backing_bs(p)) { 1856 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); 1857 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1858 break; 1859 } 1860 /* [sector_num, pnum] unallocated on this layer, which could be only 1861 * the first part of [sector_num, nb_sectors]. 
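 * Clamp nb_sectors to that prefix so the next (backing) layer is only
 * queried for the range known to be unallocated here.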
*/ 1862 nb_sectors = MIN(nb_sectors, *pnum); 1863 } 1864 return ret; 1865 } 1866 1867 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1868 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1869 { 1870 BdrvCoGetBlockStatusData *data = opaque; 1871 1872 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1873 data->sector_num, 1874 data->nb_sectors, 1875 data->pnum, 1876 data->file); 1877 data->done = true; 1878 } 1879 1880 /* 1881 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1882 * 1883 * See bdrv_co_get_block_status_above() for details. 1884 */ 1885 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1886 BlockDriverState *base, 1887 int64_t sector_num, 1888 int nb_sectors, int *pnum, 1889 BlockDriverState **file) 1890 { 1891 Coroutine *co; 1892 BdrvCoGetBlockStatusData data = { 1893 .bs = bs, 1894 .base = base, 1895 .file = file, 1896 .sector_num = sector_num, 1897 .nb_sectors = nb_sectors, 1898 .pnum = pnum, 1899 .done = false, 1900 }; 1901 1902 if (qemu_in_coroutine()) { 1903 /* Fast-path if already in coroutine context */ 1904 bdrv_get_block_status_above_co_entry(&data); 1905 } else { 1906 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry, 1907 &data); 1908 bdrv_coroutine_enter(bs, co); 1909 BDRV_POLL_WHILE(bs, !data.done); 1910 } 1911 return data.ret; 1912 } 1913 1914 int64_t bdrv_get_block_status(BlockDriverState *bs, 1915 int64_t sector_num, 1916 int nb_sectors, int *pnum, 1917 BlockDriverState **file) 1918 { 1919 return bdrv_get_block_status_above(bs, backing_bs(bs), 1920 sector_num, nb_sectors, pnum, file); 1921 } 1922 1923 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1924 int nb_sectors, int *pnum) 1925 { 1926 BlockDriverState *file; 1927 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, 1928 &file); 1929 if (ret < 0) { 1930 return ret; 1931 } 1932 return !!(ret & BDRV_BLOCK_ALLOCATED); 1933 } 1934 1935 /* 1936 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1937 * 1938 * Return true if the given sector is allocated in any image between 1939 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1940 * sector is allocated in any image of the chain. Return false otherwise. 1941 * 1942 * 'pnum' is set to the number of sectors (including and immediately following 1943 * the specified sector) that are known to be in the same 1944 * allocated/unallocated state. 1945 * 1946 */ 1947 int bdrv_is_allocated_above(BlockDriverState *top, 1948 BlockDriverState *base, 1949 int64_t sector_num, 1950 int nb_sectors, int *pnum) 1951 { 1952 BlockDriverState *intermediate; 1953 int ret, n = nb_sectors; 1954 1955 intermediate = top; 1956 while (intermediate && intermediate != base) { 1957 int pnum_inter; 1958 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1959 &pnum_inter); 1960 if (ret < 0) { 1961 return ret; 1962 } else if (ret) { 1963 *pnum = pnum_inter; 1964 return 1; 1965 } 1966 1967 /* 1968 * [sector_num, nb_sectors] is unallocated on top but intermediate 1969 * might have 1970 * 1971 * [sector_num+x, nr_sectors] allocated. 
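 * pnum_inter then only covers the unallocated prefix on this layer, which
 * is why n may be shrunk below before the next backing layer is examined.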

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}

typedef struct BdrvVmstateCo {
    BlockDriverState    *bs;
    QEMUIOVector        *qiov;
    int64_t             pos;
    bool                is_read;
    int                 ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
                       : drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    return -ENOTSUP;
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs         = bs,
            .qiov       = qiov,
            .pos        = pos,
            .is_read    = is_read,
            .ret        = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        while (data.ret == -EINPROGRESS) {
            aio_poll(bdrv_get_aio_context(bs), true);
        }
        return data.ret;
    }
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = buf,
        .iov_len    = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}
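
/*
 * Illustrative sketch, not part of the original file (unused here): a simple
 * vmstate save/load round trip.  Only format drivers that implement
 * bdrv_save_vmstate/bdrv_load_vmstate (e.g. qcow2) support this, and the
 * data lives in a snapshot area rather than in the guest-visible disk
 * contents.  The buffer contents and helper name are hypothetical.
 */
G_GNUC_UNUSED static int bdrv_example_vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[512] = { 0xab };
    uint8_t in[512] = { 0 };
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out)); /* size or -errno */
    if (ret < 0) {
        return ret;     /* e.g. -ENOTSUP for formats without vmstate support */
    }

    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    if (ret < 0) {
        return ret;
    }

    return memcmp(in, out, sizeof(out)) == 0 ? 0 : -EIO;
}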

/**************************************************************/
/* async I/Os */

BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);

    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
                                  0, cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);

    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
                                  0, cb, opaque, true);
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}
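
/*
 * Illustrative sketch, not part of the original file (unused here): issuing an
 * asynchronous read and waiting for its completion callback.  The callback
 * signature is the standard BlockCompletionFunc; the context struct, the
 * callback body and the helper names are hypothetical.
 */
typedef struct BdrvExampleAioCtx {
    int ret;
    bool done;
} BdrvExampleAioCtx;

static void bdrv_example_aio_cb(void *opaque, int ret)
{
    BdrvExampleAioCtx *ctx = opaque;

    ctx->ret = ret;     /* 0 on success, negative errno on failure */
    ctx->done = true;
}

G_GNUC_UNUSED static void bdrv_example_aio_read(BdrvChild *child, void *buf)
{
    BdrvExampleAioCtx ctx = { .ret = -EINPROGRESS, .done = false };
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = BDRV_SECTOR_SIZE,   /* one sector; must match nb_sectors */
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /* Read sector 0.  The returned BlockAIOCB could be handed to
     * bdrv_aio_cancel_async() if the caller loses interest; here we
     * simply wait for the callback. */
    bdrv_aio_readv(child, 0, &qiov, 1, bdrv_example_aio_cb, &ctx);

    while (!ctx.done) {
        aio_poll(bdrv_get_aio_context(child->bs), true);
    }
}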

/**************************************************************/
/* async block device emulation */

typedef struct BlockRequest {
    union {
        /* Used during read, write, trim */
        struct {
            int64_t offset;
            int bytes;
            int flags;
            QEMUIOVector *qiov;
        };
        /* Used during ioctl */
        struct {
            int req;
            void *buf;
        };
    };
    BlockCompletionFunc *cb;
    void *opaque;

    int error;
} BlockRequest;

typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BdrvChild *child;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        bdrv_dec_in_flight(acb->common.bs);
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    }
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
                                          int64_t offset,
                                          QEMUIOVector *qiov,
                                          BdrvRequestFlags flags,
                                          BlockCompletionFunc *cb,
                                          void *opaque,
                                          bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
    bdrv_inc_in_flight(child->bs);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
    acb->child = child;
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.offset = offset;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw, acb);
    bdrv_coroutine_enter(child->bs, co);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
    bdrv_inc_in_flight(bs);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb);
    bdrv_coroutine_enter(bs, co);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
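
/*
 * Illustrative sketch, not part of the original file (unused here): an
 * asynchronous flush.  Because of the need_bh logic above, the completion
 * callback is never invoked before bdrv_aio_flush() has returned, even if
 * the coroutine finishes immediately; it always runs from a bottom half in
 * the node's AioContext.  Callback and helper names are hypothetical.
 */
static void bdrv_example_flush_cb(void *opaque, int ret)
{
    int *result = opaque;

    *result = ret;
}

G_GNUC_UNUSED static int bdrv_example_aio_flush(BlockDriverState *bs)
{
    int result = -EINPROGRESS;

    bdrv_aio_flush(bs, bdrv_example_flush_cb, &result);
    while (result == -EINPROGRESS) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }
    return result;
}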

/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;


static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    current_gen = bs->write_gen;

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, NULL);
    }

    bs->active_flush_req = true;

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}
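
/*
 * Illustrative sketch, not part of the original file (unused here): flushing
 * a node and reporting failure.  Coroutine callers could invoke
 * bdrv_co_flush() directly, but bdrv_flush() is safe from both contexts
 * because it takes the coroutine fast path above when appropriate.  The
 * helper name and error message are hypothetical.
 */
G_GNUC_UNUSED static void bdrv_example_flush_and_report(BlockDriverState *bs)
{
    int ret = bdrv_flush(bs);

    if (ret < 0) {
        error_report("flush failed: %s", strerror(-ret));
    }
}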

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int count;
    int ret;
} DiscardCo;
static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
}

int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int count)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, count);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + count) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (count > 0) {
        int ret;
        int num = count;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(count, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        count -= num;
    }
    ret = 0;
out:
    ++bs->write_gen;
    bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
                   req.bytes >> BDRV_SECTOR_BITS);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .count = count,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}

int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}
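
/*
 * Illustrative sketch, not part of the original file (unused here): discarding
 * a byte range.  The offset and count need not respect the driver's pdiscard
 * alignment; the fragmentation logic above splits the request, and drivers
 * that reject unaligned pieces with -ENOTSUP are tolerated because discard is
 * only advisory.  The helper name is hypothetical.
 */
G_GNUC_UNUSED static int bdrv_example_discard_range(BlockDriverState *bs,
                                                    int64_t offset, int count)
{
    int ret = bdrv_pdiscard(bs, offset, count);

    if (ret < 0) {
        /* A real I/O error; lack of support or BDRV_O_UNMAP unset returns 0. */
        return ret;
    }
    /* Success does not guarantee the data is gone; discard is advisory. */
    return 0;
}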

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (bs->io_plugged++ == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (--bs->io_plugged == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}
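
/*
 * Illustrative sketch, not part of the original file (unused here): allocating
 * a bounce buffer that honours the node's memory-alignment limits and checking
 * whether a vector built from it satisfies bdrv_min_mem_align() (as needed,
 * for example, for O_DIRECT I/O).  Note that bdrv_qiov_is_aligned() checks
 * both base addresses and lengths, so 'size' must also be a multiple of the
 * alignment for the check to pass.  qemu_vfree() pairs with the
 * qemu_memalign()-based allocators above.  The helper name is hypothetical.
 */
G_GNUC_UNUSED static bool bdrv_example_aligned_buffer(BlockDriverState *bs,
                                                      size_t size)
{
    QEMUIOVector qiov;
    struct iovec iov;
    void *buf = qemu_try_blockalign(bs, size);
    bool aligned;

    if (!buf) {
        /* Allocation failure; qemu_blockalign() would have aborted instead. */
        return false;
    }

    iov.iov_base = buf;
    iov.iov_len = size;
    qemu_iovec_init_external(&qiov, &iov, 1);

    aligned = bdrv_qiov_is_aligned(bs, &qiov);

    qemu_vfree(buf);
    return aligned;
}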