1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "trace.h" 27 #include "sysemu/block-backend.h" 28 #include "block/blockjob.h" 29 #include "block/block_int.h" 30 #include "qemu/cutils.h" 31 #include "qapi/error.h" 32 #include "qemu/error-report.h" 33 34 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 35 36 static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, 37 int64_t offset, 38 QEMUIOVector *qiov, 39 BdrvRequestFlags flags, 40 BlockCompletionFunc *cb, 41 void *opaque, 42 bool is_write); 43 static void coroutine_fn bdrv_co_do_rw(void *opaque); 44 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 45 int64_t offset, int count, BdrvRequestFlags flags); 46 47 void bdrv_parent_drained_begin(BlockDriverState *bs) 48 { 49 BdrvChild *c; 50 51 QLIST_FOREACH(c, &bs->parents, next_parent) { 52 if (c->role->drained_begin) { 53 c->role->drained_begin(c); 54 } 55 } 56 } 57 58 void bdrv_parent_drained_end(BlockDriverState *bs) 59 { 60 BdrvChild *c; 61 62 QLIST_FOREACH(c, &bs->parents, next_parent) { 63 if (c->role->drained_end) { 64 c->role->drained_end(c); 65 } 66 } 67 } 68 69 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) 70 { 71 dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); 72 dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); 73 dst->opt_mem_alignment = MAX(dst->opt_mem_alignment, 74 src->opt_mem_alignment); 75 dst->min_mem_alignment = MAX(dst->min_mem_alignment, 76 src->min_mem_alignment); 77 dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov); 78 } 79 80 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) 81 { 82 BlockDriver *drv = bs->drv; 83 Error *local_err = NULL; 84 85 memset(&bs->bl, 0, sizeof(bs->bl)); 86 87 if (!drv) { 88 return; 89 } 90 91 /* Default alignment based on whether driver has byte interface */ 92 bs->bl.request_alignment = drv->bdrv_co_preadv ? 
1 : 512; 93 94 /* Take some limits from the children as a default */ 95 if (bs->file) { 96 bdrv_refresh_limits(bs->file->bs, &local_err); 97 if (local_err) { 98 error_propagate(errp, local_err); 99 return; 100 } 101 bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); 102 } else { 103 bs->bl.min_mem_alignment = 512; 104 bs->bl.opt_mem_alignment = getpagesize(); 105 106 /* Safe default since most protocols use readv()/writev()/etc */ 107 bs->bl.max_iov = IOV_MAX; 108 } 109 110 if (bs->backing) { 111 bdrv_refresh_limits(bs->backing->bs, &local_err); 112 if (local_err) { 113 error_propagate(errp, local_err); 114 return; 115 } 116 bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); 117 } 118 119 /* Then let the driver override it */ 120 if (drv->bdrv_refresh_limits) { 121 drv->bdrv_refresh_limits(bs, errp); 122 } 123 } 124 125 /** 126 * The copy-on-read flag is actually a reference count so multiple users may 127 * use the feature without worrying about clobbering its previous state. 128 * Copy-on-read stays enabled until all users have called to disable it. 129 */ 130 void bdrv_enable_copy_on_read(BlockDriverState *bs) 131 { 132 bs->copy_on_read++; 133 } 134 135 void bdrv_disable_copy_on_read(BlockDriverState *bs) 136 { 137 assert(bs->copy_on_read > 0); 138 bs->copy_on_read--; 139 } 140 141 /* Check if any requests are in-flight (including throttled requests) */ 142 bool bdrv_requests_pending(BlockDriverState *bs) 143 { 144 BdrvChild *child; 145 146 if (atomic_read(&bs->in_flight)) { 147 return true; 148 } 149 150 QLIST_FOREACH(child, &bs->children, next) { 151 if (bdrv_requests_pending(child->bs)) { 152 return true; 153 } 154 } 155 156 return false; 157 } 158 159 static bool bdrv_drain_recurse(BlockDriverState *bs) 160 { 161 BdrvChild *child, *tmp; 162 bool waited; 163 164 waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); 165 166 if (bs->drv && bs->drv->bdrv_drain) { 167 bs->drv->bdrv_drain(bs); 168 } 169 170 QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { 171 BlockDriverState *bs = child->bs; 172 bool in_main_loop = 173 qemu_get_current_aio_context() == qemu_get_aio_context(); 174 assert(bs->refcnt > 0); 175 if (in_main_loop) { 176 /* In case the recursive bdrv_drain_recurse processes a 177 * block_job_defer_to_main_loop BH and modifies the graph, 178 * let's hold a reference to bs until we are done. 179 * 180 * IOThread doesn't have such a BH, and it is not safe to call 181 * bdrv_unref without BQL, so skip doing it there. 182 */ 183 bdrv_ref(bs); 184 } 185 waited |= bdrv_drain_recurse(bs); 186 if (in_main_loop) { 187 bdrv_unref(bs); 188 } 189 } 190 191 return waited; 192 } 193 194 typedef struct { 195 Coroutine *co; 196 BlockDriverState *bs; 197 bool done; 198 } BdrvCoDrainData; 199 200 static void bdrv_co_drain_bh_cb(void *opaque) 201 { 202 BdrvCoDrainData *data = opaque; 203 Coroutine *co = data->co; 204 BlockDriverState *bs = data->bs; 205 206 bdrv_dec_in_flight(bs); 207 bdrv_drained_begin(bs); 208 data->done = true; 209 aio_co_wake(co); 210 } 211 212 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) 213 { 214 BdrvCoDrainData data; 215 216 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and 217 * other coroutines run if they were queued from 218 * qemu_co_queue_run_restart(). 
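 *
 * Rough sequence, using only helpers defined in this file: bdrv_inc_in_flight()
 * keeps the BDS busy, bdrv_co_drain_bh_cb() is scheduled in the BDS AioContext,
 * and this coroutine yields; the BH then drops the in_flight reference, calls
 * bdrv_drained_begin() outside coroutine context and wakes the coroutine.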
*/ 219 220 assert(qemu_in_coroutine()); 221 data = (BdrvCoDrainData) { 222 .co = qemu_coroutine_self(), 223 .bs = bs, 224 .done = false, 225 }; 226 bdrv_inc_in_flight(bs); 227 aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), 228 bdrv_co_drain_bh_cb, &data); 229 230 qemu_coroutine_yield(); 231 /* If we are resumed from some other event (such as an aio completion or a 232 * timer callback), it is a bug in the caller that should be fixed. */ 233 assert(data.done); 234 } 235 236 void bdrv_drained_begin(BlockDriverState *bs) 237 { 238 if (qemu_in_coroutine()) { 239 bdrv_co_yield_to_drain(bs); 240 return; 241 } 242 243 if (!bs->quiesce_counter++) { 244 aio_disable_external(bdrv_get_aio_context(bs)); 245 bdrv_parent_drained_begin(bs); 246 } 247 248 bdrv_drain_recurse(bs); 249 } 250 251 void bdrv_drained_end(BlockDriverState *bs) 252 { 253 assert(bs->quiesce_counter > 0); 254 if (--bs->quiesce_counter > 0) { 255 return; 256 } 257 258 bdrv_parent_drained_end(bs); 259 aio_enable_external(bdrv_get_aio_context(bs)); 260 } 261 262 /* 263 * Wait for pending requests to complete on a single BlockDriverState subtree, 264 * and suspend block driver's internal I/O until next request arrives. 265 * 266 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 267 * AioContext. 268 * 269 * Only this BlockDriverState's AioContext is run, so in-flight requests must 270 * not depend on events in other AioContexts. In that case, use 271 * bdrv_drain_all() instead. 272 */ 273 void coroutine_fn bdrv_co_drain(BlockDriverState *bs) 274 { 275 assert(qemu_in_coroutine()); 276 bdrv_drained_begin(bs); 277 bdrv_drained_end(bs); 278 } 279 280 void bdrv_drain(BlockDriverState *bs) 281 { 282 bdrv_drained_begin(bs); 283 bdrv_drained_end(bs); 284 } 285 286 /* 287 * Wait for pending requests to complete across all BlockDriverStates 288 * 289 * This function does not flush data to disk, use bdrv_flush_all() for that 290 * after calling this function. 291 * 292 * This pauses all block jobs and disables external clients. It must 293 * be paired with bdrv_drain_all_end(). 294 * 295 * NOTE: no new block jobs or BlockDriverStates can be created between 296 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls. 297 */ 298 void bdrv_drain_all_begin(void) 299 { 300 /* Always run first iteration so any pending completion BHs run */ 301 bool waited = true; 302 BlockDriverState *bs; 303 BdrvNextIterator it; 304 BlockJob *job = NULL; 305 GSList *aio_ctxs = NULL, *ctx; 306 307 while ((job = block_job_next(job))) { 308 AioContext *aio_context = blk_get_aio_context(job->blk); 309 310 aio_context_acquire(aio_context); 311 block_job_pause(job); 312 aio_context_release(aio_context); 313 } 314 315 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 316 AioContext *aio_context = bdrv_get_aio_context(bs); 317 318 aio_context_acquire(aio_context); 319 bdrv_parent_drained_begin(bs); 320 aio_disable_external(aio_context); 321 aio_context_release(aio_context); 322 323 if (!g_slist_find(aio_ctxs, aio_context)) { 324 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 325 } 326 } 327 328 /* Note that completion of an asynchronous I/O operation can trigger any 329 * number of other I/O operations on other devices---for example a 330 * coroutine can submit an I/O request to another device in response to 331 * request completion. Therefore we must keep looping until there was no 332 * more activity rather than simply draining each device independently. 
333 */ 334 while (waited) { 335 waited = false; 336 337 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 338 AioContext *aio_context = ctx->data; 339 340 aio_context_acquire(aio_context); 341 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 342 if (aio_context == bdrv_get_aio_context(bs)) { 343 waited |= bdrv_drain_recurse(bs); 344 } 345 } 346 aio_context_release(aio_context); 347 } 348 } 349 350 g_slist_free(aio_ctxs); 351 } 352 353 void bdrv_drain_all_end(void) 354 { 355 BlockDriverState *bs; 356 BdrvNextIterator it; 357 BlockJob *job = NULL; 358 359 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 360 AioContext *aio_context = bdrv_get_aio_context(bs); 361 362 aio_context_acquire(aio_context); 363 aio_enable_external(aio_context); 364 bdrv_parent_drained_end(bs); 365 aio_context_release(aio_context); 366 } 367 368 while ((job = block_job_next(job))) { 369 AioContext *aio_context = blk_get_aio_context(job->blk); 370 371 aio_context_acquire(aio_context); 372 block_job_resume(job); 373 aio_context_release(aio_context); 374 } 375 } 376 377 void bdrv_drain_all(void) 378 { 379 bdrv_drain_all_begin(); 380 bdrv_drain_all_end(); 381 } 382 383 /** 384 * Remove an active request from the tracked requests list 385 * 386 * This function should be called when a tracked request is completing. 387 */ 388 static void tracked_request_end(BdrvTrackedRequest *req) 389 { 390 if (req->serialising) { 391 req->bs->serialising_in_flight--; 392 } 393 394 QLIST_REMOVE(req, list); 395 qemu_co_queue_restart_all(&req->wait_queue); 396 } 397 398 /** 399 * Add an active request to the tracked requests list 400 */ 401 static void tracked_request_begin(BdrvTrackedRequest *req, 402 BlockDriverState *bs, 403 int64_t offset, 404 unsigned int bytes, 405 enum BdrvTrackedRequestType type) 406 { 407 *req = (BdrvTrackedRequest){ 408 .bs = bs, 409 .offset = offset, 410 .bytes = bytes, 411 .type = type, 412 .co = qemu_coroutine_self(), 413 .serialising = false, 414 .overlap_offset = offset, 415 .overlap_bytes = bytes, 416 }; 417 418 qemu_co_queue_init(&req->wait_queue); 419 420 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 421 } 422 423 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 424 { 425 int64_t overlap_offset = req->offset & ~(align - 1); 426 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 427 - overlap_offset; 428 429 if (!req->serialising) { 430 req->bs->serialising_in_flight++; 431 req->serialising = true; 432 } 433 434 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 435 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 436 } 437 438 /** 439 * Round a region to cluster boundaries (sector-based) 440 */ 441 void bdrv_round_sectors_to_clusters(BlockDriverState *bs, 442 int64_t sector_num, int nb_sectors, 443 int64_t *cluster_sector_num, 444 int *cluster_nb_sectors) 445 { 446 BlockDriverInfo bdi; 447 448 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 449 *cluster_sector_num = sector_num; 450 *cluster_nb_sectors = nb_sectors; 451 } else { 452 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 453 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 454 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 455 nb_sectors, c); 456 } 457 } 458 459 /** 460 * Round a region to cluster boundaries 461 */ 462 void bdrv_round_to_clusters(BlockDriverState *bs, 463 int64_t offset, unsigned int bytes, 464 int64_t *cluster_offset, 465 unsigned int *cluster_bytes) 466 { 467 BlockDriverInfo bdi; 468 469 if 
(bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 470 *cluster_offset = offset; 471 *cluster_bytes = bytes; 472 } else { 473 int64_t c = bdi.cluster_size; 474 *cluster_offset = QEMU_ALIGN_DOWN(offset, c); 475 *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); 476 } 477 } 478 479 static int bdrv_get_cluster_size(BlockDriverState *bs) 480 { 481 BlockDriverInfo bdi; 482 int ret; 483 484 ret = bdrv_get_info(bs, &bdi); 485 if (ret < 0 || bdi.cluster_size == 0) { 486 return bs->bl.request_alignment; 487 } else { 488 return bdi.cluster_size; 489 } 490 } 491 492 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 493 int64_t offset, unsigned int bytes) 494 { 495 /* aaaa bbbb */ 496 if (offset >= req->overlap_offset + req->overlap_bytes) { 497 return false; 498 } 499 /* bbbb aaaa */ 500 if (req->overlap_offset >= offset + bytes) { 501 return false; 502 } 503 return true; 504 } 505 506 void bdrv_inc_in_flight(BlockDriverState *bs) 507 { 508 atomic_inc(&bs->in_flight); 509 } 510 511 static void dummy_bh_cb(void *opaque) 512 { 513 } 514 515 void bdrv_wakeup(BlockDriverState *bs) 516 { 517 if (bs->wakeup) { 518 aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); 519 } 520 } 521 522 void bdrv_dec_in_flight(BlockDriverState *bs) 523 { 524 atomic_dec(&bs->in_flight); 525 bdrv_wakeup(bs); 526 } 527 528 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 529 { 530 BlockDriverState *bs = self->bs; 531 BdrvTrackedRequest *req; 532 bool retry; 533 bool waited = false; 534 535 if (!bs->serialising_in_flight) { 536 return false; 537 } 538 539 do { 540 retry = false; 541 QLIST_FOREACH(req, &bs->tracked_requests, list) { 542 if (req == self || (!req->serialising && !self->serialising)) { 543 continue; 544 } 545 if (tracked_request_overlaps(req, self->overlap_offset, 546 self->overlap_bytes)) 547 { 548 /* Hitting this means there was a reentrant request, for 549 * example, a block driver issuing nested requests. This must 550 * never happen since it means deadlock. 551 */ 552 assert(qemu_coroutine_self() != req->co); 553 554 /* If the request is already (indirectly) waiting for us, or 555 * will wait for us as soon as it wakes up, then just go on 556 * (instead of producing a deadlock in the former case). 
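 *
 * Hypothetical example with two overlapping serialising requests A and B:
 * if B is already blocked in qemu_co_queue_wait() on A, then B's
 * waiting_for field is set, so A skips B here instead of waiting on it,
 * which would otherwise close a wait cycle.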
*/ 557 if (!req->waiting_for) { 558 self->waiting_for = req; 559 qemu_co_queue_wait(&req->wait_queue, NULL); 560 self->waiting_for = NULL; 561 retry = true; 562 waited = true; 563 break; 564 } 565 } 566 } 567 } while (retry); 568 569 return waited; 570 } 571 572 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 573 size_t size) 574 { 575 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 576 return -EIO; 577 } 578 579 if (!bdrv_is_inserted(bs)) { 580 return -ENOMEDIUM; 581 } 582 583 if (offset < 0) { 584 return -EIO; 585 } 586 587 return 0; 588 } 589 590 typedef struct RwCo { 591 BdrvChild *child; 592 int64_t offset; 593 QEMUIOVector *qiov; 594 bool is_write; 595 int ret; 596 BdrvRequestFlags flags; 597 } RwCo; 598 599 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 600 { 601 RwCo *rwco = opaque; 602 603 if (!rwco->is_write) { 604 rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset, 605 rwco->qiov->size, rwco->qiov, 606 rwco->flags); 607 } else { 608 rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset, 609 rwco->qiov->size, rwco->qiov, 610 rwco->flags); 611 } 612 } 613 614 /* 615 * Process a vectored synchronous request using coroutines 616 */ 617 static int bdrv_prwv_co(BdrvChild *child, int64_t offset, 618 QEMUIOVector *qiov, bool is_write, 619 BdrvRequestFlags flags) 620 { 621 Coroutine *co; 622 RwCo rwco = { 623 .child = child, 624 .offset = offset, 625 .qiov = qiov, 626 .is_write = is_write, 627 .ret = NOT_DONE, 628 .flags = flags, 629 }; 630 631 if (qemu_in_coroutine()) { 632 /* Fast-path if already in coroutine context */ 633 bdrv_rw_co_entry(&rwco); 634 } else { 635 co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); 636 bdrv_coroutine_enter(child->bs, co); 637 BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); 638 } 639 return rwco.ret; 640 } 641 642 /* 643 * Process a synchronous request using coroutines 644 */ 645 static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf, 646 int nb_sectors, bool is_write, BdrvRequestFlags flags) 647 { 648 QEMUIOVector qiov; 649 struct iovec iov = { 650 .iov_base = (void *)buf, 651 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 652 }; 653 654 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 655 return -EINVAL; 656 } 657 658 qemu_iovec_init_external(&qiov, &iov, 1); 659 return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS, 660 &qiov, is_write, flags); 661 } 662 663 /* return < 0 if error. See bdrv_write() for the return codes */ 664 int bdrv_read(BdrvChild *child, int64_t sector_num, 665 uint8_t *buf, int nb_sectors) 666 { 667 return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0); 668 } 669 670 /* Return < 0 if error. Important errors are: 671 -EIO generic I/O error (may happen for all errors) 672 -ENOMEDIUM No media inserted. 673 -EINVAL Invalid sector number or nb_sectors 674 -EACCES Trying to write a read-only device 675 */ 676 int bdrv_write(BdrvChild *child, int64_t sector_num, 677 const uint8_t *buf, int nb_sectors) 678 { 679 return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 680 } 681 682 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, 683 int count, BdrvRequestFlags flags) 684 { 685 QEMUIOVector qiov; 686 struct iovec iov = { 687 .iov_base = NULL, 688 .iov_len = count, 689 }; 690 691 qemu_iovec_init_external(&qiov, &iov, 1); 692 return bdrv_prwv_co(child, offset, &qiov, true, 693 BDRV_REQ_ZERO_WRITE | flags); 694 } 695 696 /* 697 * Completely zero out a block device with the help of bdrv_pwrite_zeroes. 
698 * The operation is sped up by checking the block status and only writing 699 * zeroes to the device if they currently do not return zeroes. Optional 700 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, 701 * BDRV_REQ_FUA). 702 * 703 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 704 */ 705 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) 706 { 707 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 708 BlockDriverState *bs = child->bs; 709 BlockDriverState *file; 710 int n; 711 712 target_sectors = bdrv_nb_sectors(bs); 713 if (target_sectors < 0) { 714 return target_sectors; 715 } 716 717 for (;;) { 718 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 719 if (nb_sectors <= 0) { 720 return 0; 721 } 722 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); 723 if (ret < 0) { 724 error_report("error getting block status at sector %" PRId64 ": %s", 725 sector_num, strerror(-ret)); 726 return ret; 727 } 728 if (ret & BDRV_BLOCK_ZERO) { 729 sector_num += n; 730 continue; 731 } 732 ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS, 733 n << BDRV_SECTOR_BITS, flags); 734 if (ret < 0) { 735 error_report("error writing zeroes at sector %" PRId64 ": %s", 736 sector_num, strerror(-ret)); 737 return ret; 738 } 739 sector_num += n; 740 } 741 } 742 743 int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) 744 { 745 int ret; 746 747 ret = bdrv_prwv_co(child, offset, qiov, false, 0); 748 if (ret < 0) { 749 return ret; 750 } 751 752 return qiov->size; 753 } 754 755 int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) 756 { 757 QEMUIOVector qiov; 758 struct iovec iov = { 759 .iov_base = (void *)buf, 760 .iov_len = bytes, 761 }; 762 763 if (bytes < 0) { 764 return -EINVAL; 765 } 766 767 qemu_iovec_init_external(&qiov, &iov, 1); 768 return bdrv_preadv(child, offset, &qiov); 769 } 770 771 int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) 772 { 773 int ret; 774 775 ret = bdrv_prwv_co(child, offset, qiov, true, 0); 776 if (ret < 0) { 777 return ret; 778 } 779 780 return qiov->size; 781 } 782 783 int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) 784 { 785 QEMUIOVector qiov; 786 struct iovec iov = { 787 .iov_base = (void *) buf, 788 .iov_len = bytes, 789 }; 790 791 if (bytes < 0) { 792 return -EINVAL; 793 } 794 795 qemu_iovec_init_external(&qiov, &iov, 1); 796 return bdrv_pwritev(child, offset, &qiov); 797 } 798 799 /* 800 * Writes to the file and ensures that no writes are reordered across this 801 * request (acts as a barrier) 802 * 803 * Returns 0 on success, -errno in error cases. 
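 *
 * Minimal usage sketch (the BdrvChild pointer and the header buffer are
 * assumed to exist in the caller; they are not defined here):
 *
 *     uint8_t header[512];
 *
 *     ... fill in header ...
 *     ret = bdrv_pwrite_sync(child, 0, header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *
 * A negative return means either the write or the subsequent bdrv_flush()
 * failed, so the data may not have reached the disk.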
804 */ 805 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, 806 const void *buf, int count) 807 { 808 int ret; 809 810 ret = bdrv_pwrite(child, offset, buf, count); 811 if (ret < 0) { 812 return ret; 813 } 814 815 ret = bdrv_flush(child->bs); 816 if (ret < 0) { 817 return ret; 818 } 819 820 return 0; 821 } 822 823 typedef struct CoroutineIOCompletion { 824 Coroutine *coroutine; 825 int ret; 826 } CoroutineIOCompletion; 827 828 static void bdrv_co_io_em_complete(void *opaque, int ret) 829 { 830 CoroutineIOCompletion *co = opaque; 831 832 co->ret = ret; 833 aio_co_wake(co->coroutine); 834 } 835 836 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, 837 uint64_t offset, uint64_t bytes, 838 QEMUIOVector *qiov, int flags) 839 { 840 BlockDriver *drv = bs->drv; 841 int64_t sector_num; 842 unsigned int nb_sectors; 843 844 assert(!(flags & ~BDRV_REQ_MASK)); 845 846 if (drv->bdrv_co_preadv) { 847 return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); 848 } 849 850 sector_num = offset >> BDRV_SECTOR_BITS; 851 nb_sectors = bytes >> BDRV_SECTOR_BITS; 852 853 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 854 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 855 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); 856 857 if (drv->bdrv_co_readv) { 858 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 859 } else { 860 BlockAIOCB *acb; 861 CoroutineIOCompletion co = { 862 .coroutine = qemu_coroutine_self(), 863 }; 864 865 acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, 866 bdrv_co_io_em_complete, &co); 867 if (acb == NULL) { 868 return -EIO; 869 } else { 870 qemu_coroutine_yield(); 871 return co.ret; 872 } 873 } 874 } 875 876 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, 877 uint64_t offset, uint64_t bytes, 878 QEMUIOVector *qiov, int flags) 879 { 880 BlockDriver *drv = bs->drv; 881 int64_t sector_num; 882 unsigned int nb_sectors; 883 int ret; 884 885 assert(!(flags & ~BDRV_REQ_MASK)); 886 887 if (drv->bdrv_co_pwritev) { 888 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, 889 flags & bs->supported_write_flags); 890 flags &= ~bs->supported_write_flags; 891 goto emulate_flags; 892 } 893 894 sector_num = offset >> BDRV_SECTOR_BITS; 895 nb_sectors = bytes >> BDRV_SECTOR_BITS; 896 897 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 898 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 899 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); 900 901 if (drv->bdrv_co_writev_flags) { 902 ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, 903 flags & bs->supported_write_flags); 904 flags &= ~bs->supported_write_flags; 905 } else if (drv->bdrv_co_writev) { 906 assert(!bs->supported_write_flags); 907 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 908 } else { 909 BlockAIOCB *acb; 910 CoroutineIOCompletion co = { 911 .coroutine = qemu_coroutine_self(), 912 }; 913 914 acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, 915 bdrv_co_io_em_complete, &co); 916 if (acb == NULL) { 917 ret = -EIO; 918 } else { 919 qemu_coroutine_yield(); 920 ret = co.ret; 921 } 922 } 923 924 emulate_flags: 925 if (ret == 0 && (flags & BDRV_REQ_FUA)) { 926 ret = bdrv_co_flush(bs); 927 } 928 929 return ret; 930 } 931 932 static int coroutine_fn 933 bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset, 934 uint64_t bytes, QEMUIOVector *qiov) 935 { 936 BlockDriver *drv = bs->drv; 937 938 if (!drv->bdrv_co_pwritev_compressed) { 939 return -ENOTSUP; 940 } 941 942 return 
        drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_offset;
    unsigned int cluster_bytes;
    size_t skip_bytes;
    int ret;

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating a cluster in the image file.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    iov.iov_len = cluster_bytes;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
                             &bounce_qiov, 0);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_pwrite_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        /* FIXME: Should we (perhaps conditionally) be setting
         * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
         * that still correctly reads as zero? */
        ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
                                  &bounce_qiov, 0);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests. If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = offset - cluster_offset;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
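 *
 * Fragmentation sketch with assumed limits (not taken from any real driver):
 * if bl.max_transfer is 64 KiB, a 150 KiB read is forwarded as three
 * bdrv_driver_preadv() calls of 64 KiB, 64 KiB and 22 KiB, each with a
 * local qiov covering the matching slice of the caller's qiov.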
1031 */ 1032 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 1033 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1034 int64_t align, QEMUIOVector *qiov, int flags) 1035 { 1036 BlockDriverState *bs = child->bs; 1037 int64_t total_bytes, max_bytes; 1038 int ret = 0; 1039 uint64_t bytes_remaining = bytes; 1040 int max_transfer; 1041 1042 assert(is_power_of_2(align)); 1043 assert((offset & (align - 1)) == 0); 1044 assert((bytes & (align - 1)) == 0); 1045 assert(!qiov || bytes == qiov->size); 1046 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1047 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1048 align); 1049 1050 /* TODO: We would need a per-BDS .supported_read_flags and 1051 * potential fallback support, if we ever implement any read flags 1052 * to pass through to drivers. For now, there aren't any 1053 * passthrough flags. */ 1054 assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); 1055 1056 /* Handle Copy on Read and associated serialisation */ 1057 if (flags & BDRV_REQ_COPY_ON_READ) { 1058 /* If we touch the same cluster it counts as an overlap. This 1059 * guarantees that allocating writes will be serialized and not race 1060 * with each other for the same cluster. For example, in copy-on-read 1061 * it ensures that the CoR read and write operations are atomic and 1062 * guest writes cannot interleave between them. */ 1063 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 1064 } 1065 1066 if (!(flags & BDRV_REQ_NO_SERIALISING)) { 1067 wait_serialising_requests(req); 1068 } 1069 1070 if (flags & BDRV_REQ_COPY_ON_READ) { 1071 int64_t start_sector = offset >> BDRV_SECTOR_BITS; 1072 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 1073 unsigned int nb_sectors = end_sector - start_sector; 1074 int pnum; 1075 1076 ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum); 1077 if (ret < 0) { 1078 goto out; 1079 } 1080 1081 if (!ret || pnum != nb_sectors) { 1082 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov); 1083 goto out; 1084 } 1085 } 1086 1087 /* Forward the request to the BlockDriver, possibly fragmenting it */ 1088 total_bytes = bdrv_getlength(bs); 1089 if (total_bytes < 0) { 1090 ret = total_bytes; 1091 goto out; 1092 } 1093 1094 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 1095 if (bytes <= max_bytes && bytes <= max_transfer) { 1096 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); 1097 goto out; 1098 } 1099 1100 while (bytes_remaining) { 1101 int num; 1102 1103 if (max_bytes) { 1104 QEMUIOVector local_qiov; 1105 1106 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 1107 assert(num); 1108 qemu_iovec_init(&local_qiov, qiov->niov); 1109 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 1110 1111 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1112 num, &local_qiov, 0); 1113 max_bytes -= num; 1114 qemu_iovec_destroy(&local_qiov); 1115 } else { 1116 num = bytes_remaining; 1117 ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, 1118 bytes_remaining); 1119 } 1120 if (ret < 0) { 1121 goto out; 1122 } 1123 bytes_remaining -= num; 1124 } 1125 1126 out: 1127 return ret < 0 ? 
ret : 0; 1128 } 1129 1130 /* 1131 * Handle a read request in coroutine context 1132 */ 1133 int coroutine_fn bdrv_co_preadv(BdrvChild *child, 1134 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1135 BdrvRequestFlags flags) 1136 { 1137 BlockDriverState *bs = child->bs; 1138 BlockDriver *drv = bs->drv; 1139 BdrvTrackedRequest req; 1140 1141 uint64_t align = bs->bl.request_alignment; 1142 uint8_t *head_buf = NULL; 1143 uint8_t *tail_buf = NULL; 1144 QEMUIOVector local_qiov; 1145 bool use_local_qiov = false; 1146 int ret; 1147 1148 if (!drv) { 1149 return -ENOMEDIUM; 1150 } 1151 1152 ret = bdrv_check_byte_request(bs, offset, bytes); 1153 if (ret < 0) { 1154 return ret; 1155 } 1156 1157 bdrv_inc_in_flight(bs); 1158 1159 /* Don't do copy-on-read if we read data before write operation */ 1160 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { 1161 flags |= BDRV_REQ_COPY_ON_READ; 1162 } 1163 1164 /* Align read if necessary by padding qiov */ 1165 if (offset & (align - 1)) { 1166 head_buf = qemu_blockalign(bs, align); 1167 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1168 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1169 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1170 use_local_qiov = true; 1171 1172 bytes += offset & (align - 1); 1173 offset = offset & ~(align - 1); 1174 } 1175 1176 if ((offset + bytes) & (align - 1)) { 1177 if (!use_local_qiov) { 1178 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1179 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1180 use_local_qiov = true; 1181 } 1182 tail_buf = qemu_blockalign(bs, align); 1183 qemu_iovec_add(&local_qiov, tail_buf, 1184 align - ((offset + bytes) & (align - 1))); 1185 1186 bytes = ROUND_UP(bytes, align); 1187 } 1188 1189 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 1190 ret = bdrv_aligned_preadv(child, &req, offset, bytes, align, 1191 use_local_qiov ? 
&local_qiov : qiov, 1192 flags); 1193 tracked_request_end(&req); 1194 bdrv_dec_in_flight(bs); 1195 1196 if (use_local_qiov) { 1197 qemu_iovec_destroy(&local_qiov); 1198 qemu_vfree(head_buf); 1199 qemu_vfree(tail_buf); 1200 } 1201 1202 return ret; 1203 } 1204 1205 static int coroutine_fn bdrv_co_do_readv(BdrvChild *child, 1206 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1207 BdrvRequestFlags flags) 1208 { 1209 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1210 return -EINVAL; 1211 } 1212 1213 return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS, 1214 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1215 } 1216 1217 int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num, 1218 int nb_sectors, QEMUIOVector *qiov) 1219 { 1220 trace_bdrv_co_readv(child->bs, sector_num, nb_sectors); 1221 1222 return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0); 1223 } 1224 1225 /* Maximum buffer for write zeroes fallback, in bytes */ 1226 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) 1227 1228 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1229 int64_t offset, int count, BdrvRequestFlags flags) 1230 { 1231 BlockDriver *drv = bs->drv; 1232 QEMUIOVector qiov; 1233 struct iovec iov = {0}; 1234 int ret = 0; 1235 bool need_flush = false; 1236 int head = 0; 1237 int tail = 0; 1238 1239 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); 1240 int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1241 bs->bl.request_alignment); 1242 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1243 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1244 1245 assert(alignment % bs->bl.request_alignment == 0); 1246 head = offset % alignment; 1247 tail = (offset + count) % alignment; 1248 max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1249 assert(max_write_zeroes >= bs->bl.request_alignment); 1250 1251 while (count > 0 && !ret) { 1252 int num = count; 1253 1254 /* Align request. Block drivers can expect the "bulk" of the request 1255 * to be aligned, and that unaligned requests do not cross cluster 1256 * boundaries. 1257 */ 1258 if (head) { 1259 /* Make a small request up to the first aligned sector. For 1260 * convenience, limit this request to max_transfer even if 1261 * we don't need to fall back to writes. */ 1262 num = MIN(MIN(count, max_transfer), alignment - head); 1263 head = (head + num) % alignment; 1264 assert(num < max_write_zeroes); 1265 } else if (tail && num > alignment) { 1266 /* Shorten the request to the last aligned sector. 
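             *
             * Illustration with assumed limits (alignment 64 KiB, large
             * max_transfer and max_pwrite_zeroes): zeroing 192 KiB starting
             * at offset 4 KiB is issued as a 60 KiB head request, a 128 KiB
             * aligned request, and a 4 KiB tail request.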
             */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        count -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
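 *
 * Fragmentation sketch with assumed limits: if bl.max_transfer is 64 KiB, a
 * 150 KiB write becomes three bdrv_driver_pwritev() calls; when BDRV_REQ_FUA
 * has to be emulated, the flag is dropped on all but the last fragment so
 * that the emulated flush runs only once.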
1337 */ 1338 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1339 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1340 int64_t align, QEMUIOVector *qiov, int flags) 1341 { 1342 BlockDriverState *bs = child->bs; 1343 BlockDriver *drv = bs->drv; 1344 bool waited; 1345 int ret; 1346 1347 int64_t start_sector = offset >> BDRV_SECTOR_BITS; 1348 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 1349 uint64_t bytes_remaining = bytes; 1350 int max_transfer; 1351 1352 assert(is_power_of_2(align)); 1353 assert((offset & (align - 1)) == 0); 1354 assert((bytes & (align - 1)) == 0); 1355 assert(!qiov || bytes == qiov->size); 1356 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1357 assert(!(flags & ~BDRV_REQ_MASK)); 1358 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1359 align); 1360 1361 waited = wait_serialising_requests(req); 1362 assert(!waited || !req->serialising); 1363 assert(req->overlap_offset <= offset); 1364 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1365 assert(child->perm & BLK_PERM_WRITE); 1366 assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); 1367 1368 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1369 1370 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1371 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 1372 qemu_iovec_is_zero(qiov)) { 1373 flags |= BDRV_REQ_ZERO_WRITE; 1374 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1375 flags |= BDRV_REQ_MAY_UNMAP; 1376 } 1377 } 1378 1379 if (ret < 0) { 1380 /* Do nothing, write notifier decided to fail this request */ 1381 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1382 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1383 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 1384 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 1385 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov); 1386 } else if (bytes <= max_transfer) { 1387 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1388 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); 1389 } else { 1390 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1391 while (bytes_remaining) { 1392 int num = MIN(bytes_remaining, max_transfer); 1393 QEMUIOVector local_qiov; 1394 int local_flags = flags; 1395 1396 assert(num); 1397 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 1398 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1399 /* If FUA is going to be emulated by flush, we only 1400 * need to flush on the last iteration */ 1401 local_flags &= ~BDRV_REQ_FUA; 1402 } 1403 qemu_iovec_init(&local_qiov, qiov->niov); 1404 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 1405 1406 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 1407 num, &local_qiov, local_flags); 1408 qemu_iovec_destroy(&local_qiov); 1409 if (ret < 0) { 1410 break; 1411 } 1412 bytes_remaining -= num; 1413 } 1414 } 1415 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 1416 1417 ++bs->write_gen; 1418 bdrv_set_dirty(bs, start_sector, end_sector - start_sector); 1419 1420 if (bs->wr_highest_offset < offset + bytes) { 1421 bs->wr_highest_offset = offset + bytes; 1422 } 1423 1424 if (ret >= 0) { 1425 bs->total_sectors = MAX(bs->total_sectors, end_sector); 1426 ret = 0; 1427 } 1428 1429 return ret; 1430 } 1431 1432 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 1433 int64_t offset, 1434 unsigned int bytes, 1435 BdrvRequestFlags flags, 1436 BdrvTrackedRequest *req) 1437 { 1438 BlockDriverState *bs = 
child->bs; 1439 uint8_t *buf = NULL; 1440 QEMUIOVector local_qiov; 1441 struct iovec iov; 1442 uint64_t align = bs->bl.request_alignment; 1443 unsigned int head_padding_bytes, tail_padding_bytes; 1444 int ret = 0; 1445 1446 head_padding_bytes = offset & (align - 1); 1447 tail_padding_bytes = (align - (offset + bytes)) & (align - 1); 1448 1449 1450 assert(flags & BDRV_REQ_ZERO_WRITE); 1451 if (head_padding_bytes || tail_padding_bytes) { 1452 buf = qemu_blockalign(bs, align); 1453 iov = (struct iovec) { 1454 .iov_base = buf, 1455 .iov_len = align, 1456 }; 1457 qemu_iovec_init_external(&local_qiov, &iov, 1); 1458 } 1459 if (head_padding_bytes) { 1460 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1461 1462 /* RMW the unaligned part before head. */ 1463 mark_request_serialising(req, align); 1464 wait_serialising_requests(req); 1465 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1466 ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align, 1467 align, &local_qiov, 0); 1468 if (ret < 0) { 1469 goto fail; 1470 } 1471 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1472 1473 memset(buf + head_padding_bytes, 0, zero_bytes); 1474 ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align, 1475 align, &local_qiov, 1476 flags & ~BDRV_REQ_ZERO_WRITE); 1477 if (ret < 0) { 1478 goto fail; 1479 } 1480 offset += zero_bytes; 1481 bytes -= zero_bytes; 1482 } 1483 1484 assert(!bytes || (offset & (align - 1)) == 0); 1485 if (bytes >= align) { 1486 /* Write the aligned part in the middle. */ 1487 uint64_t aligned_bytes = bytes & ~(align - 1); 1488 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 1489 NULL, flags); 1490 if (ret < 0) { 1491 goto fail; 1492 } 1493 bytes -= aligned_bytes; 1494 offset += aligned_bytes; 1495 } 1496 1497 assert(!bytes || (offset & (align - 1)) == 0); 1498 if (bytes) { 1499 assert(align == tail_padding_bytes + bytes); 1500 /* RMW the unaligned part after tail. */ 1501 mark_request_serialising(req, align); 1502 wait_serialising_requests(req); 1503 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1504 ret = bdrv_aligned_preadv(child, req, offset, align, 1505 align, &local_qiov, 0); 1506 if (ret < 0) { 1507 goto fail; 1508 } 1509 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1510 1511 memset(buf, 0, bytes); 1512 ret = bdrv_aligned_pwritev(child, req, offset, align, align, 1513 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1514 } 1515 fail: 1516 qemu_vfree(buf); 1517 return ret; 1518 1519 } 1520 1521 /* 1522 * Handle a write request in coroutine context 1523 */ 1524 int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 1525 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1526 BdrvRequestFlags flags) 1527 { 1528 BlockDriverState *bs = child->bs; 1529 BdrvTrackedRequest req; 1530 uint64_t align = bs->bl.request_alignment; 1531 uint8_t *head_buf = NULL; 1532 uint8_t *tail_buf = NULL; 1533 QEMUIOVector local_qiov; 1534 bool use_local_qiov = false; 1535 int ret; 1536 1537 if (!bs->drv) { 1538 return -ENOMEDIUM; 1539 } 1540 if (bs->read_only) { 1541 return -EPERM; 1542 } 1543 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1544 1545 ret = bdrv_check_byte_request(bs, offset, bytes); 1546 if (ret < 0) { 1547 return ret; 1548 } 1549 1550 bdrv_inc_in_flight(bs); 1551 /* 1552 * Align write if necessary by performing a read-modify-write cycle. 1553 * Pad qiov with the read parts and be sure to have a tracked request not 1554 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
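 *
 * Worked example with illustrative numbers: with request_alignment = 512, a
 * 100-byte write at offset 700 is widened to a 512-byte write at offset 512;
 * the 188 bytes of head padding (700 & 511) and the tail padding are filled
 * from a read of that block, which is why the request is marked serialising
 * before the read-modify-write below.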
1555 */ 1556 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1557 1558 if (!qiov) { 1559 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 1560 goto out; 1561 } 1562 1563 if (offset & (align - 1)) { 1564 QEMUIOVector head_qiov; 1565 struct iovec head_iov; 1566 1567 mark_request_serialising(&req, align); 1568 wait_serialising_requests(&req); 1569 1570 head_buf = qemu_blockalign(bs, align); 1571 head_iov = (struct iovec) { 1572 .iov_base = head_buf, 1573 .iov_len = align, 1574 }; 1575 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1576 1577 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1578 ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align, 1579 align, &head_qiov, 0); 1580 if (ret < 0) { 1581 goto fail; 1582 } 1583 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1584 1585 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1586 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1587 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1588 use_local_qiov = true; 1589 1590 bytes += offset & (align - 1); 1591 offset = offset & ~(align - 1); 1592 1593 /* We have read the tail already if the request is smaller 1594 * than one aligned block. 1595 */ 1596 if (bytes < align) { 1597 qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes); 1598 bytes = align; 1599 } 1600 } 1601 1602 if ((offset + bytes) & (align - 1)) { 1603 QEMUIOVector tail_qiov; 1604 struct iovec tail_iov; 1605 size_t tail_bytes; 1606 bool waited; 1607 1608 mark_request_serialising(&req, align); 1609 waited = wait_serialising_requests(&req); 1610 assert(!waited || !use_local_qiov); 1611 1612 tail_buf = qemu_blockalign(bs, align); 1613 tail_iov = (struct iovec) { 1614 .iov_base = tail_buf, 1615 .iov_len = align, 1616 }; 1617 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1618 1619 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1620 ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1), 1621 align, align, &tail_qiov, 0); 1622 if (ret < 0) { 1623 goto fail; 1624 } 1625 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1626 1627 if (!use_local_qiov) { 1628 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1629 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1630 use_local_qiov = true; 1631 } 1632 1633 tail_bytes = (offset + bytes) & (align - 1); 1634 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1635 1636 bytes = ROUND_UP(bytes, align); 1637 } 1638 1639 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 1640 use_local_qiov ? 
&local_qiov : qiov, 1641 flags); 1642 1643 fail: 1644 1645 if (use_local_qiov) { 1646 qemu_iovec_destroy(&local_qiov); 1647 } 1648 qemu_vfree(head_buf); 1649 qemu_vfree(tail_buf); 1650 out: 1651 tracked_request_end(&req); 1652 bdrv_dec_in_flight(bs); 1653 return ret; 1654 } 1655 1656 static int coroutine_fn bdrv_co_do_writev(BdrvChild *child, 1657 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1658 BdrvRequestFlags flags) 1659 { 1660 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1661 return -EINVAL; 1662 } 1663 1664 return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS, 1665 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1666 } 1667 1668 int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num, 1669 int nb_sectors, QEMUIOVector *qiov) 1670 { 1671 trace_bdrv_co_writev(child->bs, sector_num, nb_sectors); 1672 1673 return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0); 1674 } 1675 1676 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 1677 int count, BdrvRequestFlags flags) 1678 { 1679 trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags); 1680 1681 if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 1682 flags &= ~BDRV_REQ_MAY_UNMAP; 1683 } 1684 1685 return bdrv_co_pwritev(child, offset, count, NULL, 1686 BDRV_REQ_ZERO_WRITE | flags); 1687 } 1688 1689 /* 1690 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not. 1691 */ 1692 int bdrv_flush_all(void) 1693 { 1694 BdrvNextIterator it; 1695 BlockDriverState *bs = NULL; 1696 int result = 0; 1697 1698 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 1699 AioContext *aio_context = bdrv_get_aio_context(bs); 1700 int ret; 1701 1702 aio_context_acquire(aio_context); 1703 ret = bdrv_flush(bs); 1704 if (ret < 0 && !result) { 1705 result = ret; 1706 } 1707 aio_context_release(aio_context); 1708 } 1709 1710 return result; 1711 } 1712 1713 1714 typedef struct BdrvCoGetBlockStatusData { 1715 BlockDriverState *bs; 1716 BlockDriverState *base; 1717 BlockDriverState **file; 1718 int64_t sector_num; 1719 int nb_sectors; 1720 int *pnum; 1721 int64_t ret; 1722 bool done; 1723 } BdrvCoGetBlockStatusData; 1724 1725 /* 1726 * Returns the allocation status of the specified sectors. 1727 * Drivers not implementing the functionality are assumed to not support 1728 * backing files, hence all their sectors are reported as allocated. 1729 * 1730 * If 'sector_num' is beyond the end of the disk image the return value is 0 1731 * and 'pnum' is set to 0. 1732 * 1733 * 'pnum' is set to the number of sectors (including and immediately following 1734 * the specified sector) that are known to be in the same 1735 * allocated/unallocated state. 1736 * 1737 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1738 * beyond the end of the disk image it will be clamped. 1739 * 1740 * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' 1741 * points to the BDS which the sector range is allocated in. 
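 *
 * Caller-side sketch of interpreting the result (variables assumed to be
 * declared by the caller):
 *
 *     ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum, &file);
 *     if (ret < 0) {
 *         ... error ...
 *     } else if (ret & BDRV_BLOCK_ZERO) {
 *         ... the next pnum sectors read as zero ...
 *     } else if (ret & BDRV_BLOCK_DATA) {
 *         ... allocated data; the offset in 'file' is only meaningful if
 *         BDRV_BLOCK_OFFSET_VALID is also set ...
 *     }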
1742 */ 1743 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1744 int64_t sector_num, 1745 int nb_sectors, int *pnum, 1746 BlockDriverState **file) 1747 { 1748 int64_t total_sectors; 1749 int64_t n; 1750 int64_t ret, ret2; 1751 1752 total_sectors = bdrv_nb_sectors(bs); 1753 if (total_sectors < 0) { 1754 return total_sectors; 1755 } 1756 1757 if (sector_num >= total_sectors) { 1758 *pnum = 0; 1759 return 0; 1760 } 1761 1762 n = total_sectors - sector_num; 1763 if (n < nb_sectors) { 1764 nb_sectors = n; 1765 } 1766 1767 if (!bs->drv->bdrv_co_get_block_status) { 1768 *pnum = nb_sectors; 1769 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1770 if (bs->drv->protocol_name) { 1771 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1772 } 1773 return ret; 1774 } 1775 1776 *file = NULL; 1777 bdrv_inc_in_flight(bs); 1778 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, 1779 file); 1780 if (ret < 0) { 1781 *pnum = 0; 1782 goto out; 1783 } 1784 1785 if (ret & BDRV_BLOCK_RAW) { 1786 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1787 ret = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, 1788 *pnum, pnum, file); 1789 goto out; 1790 } 1791 1792 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1793 ret |= BDRV_BLOCK_ALLOCATED; 1794 } else { 1795 if (bdrv_unallocated_blocks_are_zero(bs)) { 1796 ret |= BDRV_BLOCK_ZERO; 1797 } else if (bs->backing) { 1798 BlockDriverState *bs2 = bs->backing->bs; 1799 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1800 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1801 ret |= BDRV_BLOCK_ZERO; 1802 } 1803 } 1804 } 1805 1806 if (*file && *file != bs && 1807 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1808 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1809 BlockDriverState *file2; 1810 int file_pnum; 1811 1812 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, 1813 *pnum, &file_pnum, &file2); 1814 if (ret2 >= 0) { 1815 /* Ignore errors. This is just providing extra information, it 1816 * is useful but not necessary. 1817 */ 1818 if (!file_pnum) { 1819 /* !file_pnum indicates an offset at or beyond the EOF; it is 1820 * perfectly valid for the format block driver to point to such 1821 * offsets, so catch it and mark everything as zero */ 1822 ret |= BDRV_BLOCK_ZERO; 1823 } else { 1824 /* Limit request to the range reported by the protocol driver */ 1825 *pnum = file_pnum; 1826 ret |= (ret2 & BDRV_BLOCK_ZERO); 1827 } 1828 } 1829 } 1830 1831 out: 1832 bdrv_dec_in_flight(bs); 1833 return ret; 1834 } 1835 1836 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1837 BlockDriverState *base, 1838 int64_t sector_num, 1839 int nb_sectors, 1840 int *pnum, 1841 BlockDriverState **file) 1842 { 1843 BlockDriverState *p; 1844 int64_t ret = 0; 1845 1846 assert(bs != base); 1847 for (p = bs; p != base; p = backing_bs(p)) { 1848 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); 1849 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1850 break; 1851 } 1852 /* [sector_num, pnum] unallocated on this layer, which could be only 1853 * the first part of [sector_num, nb_sectors]. 
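 *
 * Hypothetical example: if this layer reports only the first 10 of 100
 * queried sectors as unallocated, the next (backing) layer is queried for
 * just those 10 sectors.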
*/ 1854 nb_sectors = MIN(nb_sectors, *pnum); 1855 } 1856 return ret; 1857 } 1858 1859 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1860 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1861 { 1862 BdrvCoGetBlockStatusData *data = opaque; 1863 1864 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1865 data->sector_num, 1866 data->nb_sectors, 1867 data->pnum, 1868 data->file); 1869 data->done = true; 1870 } 1871 1872 /* 1873 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1874 * 1875 * See bdrv_co_get_block_status_above() for details. 1876 */ 1877 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1878 BlockDriverState *base, 1879 int64_t sector_num, 1880 int nb_sectors, int *pnum, 1881 BlockDriverState **file) 1882 { 1883 Coroutine *co; 1884 BdrvCoGetBlockStatusData data = { 1885 .bs = bs, 1886 .base = base, 1887 .file = file, 1888 .sector_num = sector_num, 1889 .nb_sectors = nb_sectors, 1890 .pnum = pnum, 1891 .done = false, 1892 }; 1893 1894 if (qemu_in_coroutine()) { 1895 /* Fast-path if already in coroutine context */ 1896 bdrv_get_block_status_above_co_entry(&data); 1897 } else { 1898 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry, 1899 &data); 1900 bdrv_coroutine_enter(bs, co); 1901 BDRV_POLL_WHILE(bs, !data.done); 1902 } 1903 return data.ret; 1904 } 1905 1906 int64_t bdrv_get_block_status(BlockDriverState *bs, 1907 int64_t sector_num, 1908 int nb_sectors, int *pnum, 1909 BlockDriverState **file) 1910 { 1911 return bdrv_get_block_status_above(bs, backing_bs(bs), 1912 sector_num, nb_sectors, pnum, file); 1913 } 1914 1915 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1916 int nb_sectors, int *pnum) 1917 { 1918 BlockDriverState *file; 1919 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, 1920 &file); 1921 if (ret < 0) { 1922 return ret; 1923 } 1924 return !!(ret & BDRV_BLOCK_ALLOCATED); 1925 } 1926 1927 /* 1928 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1929 * 1930 * Return true if the given sector is allocated in any image between 1931 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1932 * sector is allocated in any image of the chain. Return false otherwise. 1933 * 1934 * 'pnum' is set to the number of sectors (including and immediately following 1935 * the specified sector) that are known to be in the same 1936 * allocated/unallocated state. 1937 * 1938 */ 1939 int bdrv_is_allocated_above(BlockDriverState *top, 1940 BlockDriverState *base, 1941 int64_t sector_num, 1942 int nb_sectors, int *pnum) 1943 { 1944 BlockDriverState *intermediate; 1945 int ret, n = nb_sectors; 1946 1947 intermediate = top; 1948 while (intermediate && intermediate != base) { 1949 int pnum_inter; 1950 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1951 &pnum_inter); 1952 if (ret < 0) { 1953 return ret; 1954 } else if (ret) { 1955 *pnum = pnum_inter; 1956 return 1; 1957 } 1958 1959 /* 1960 * [sector_num, nb_sectors] is unallocated on top but intermediate 1961 * might have 1962 * 1963 * [sector_num+x, nr_sectors] allocated. 
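 *
 * Hypothetical example: when querying 100 sectors, if the top layer reports
 * its first 20 sectors as unallocated, n is clamped to 20 so that the
 * answer never spans a region whose allocation status changes part-way
 * through the chain.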
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}

typedef struct BdrvVmstateCo {
    BlockDriverState *bs;
    QEMUIOVector *qiov;
    int64_t pos;
    bool is_read;
    int ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
                       : drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    return -ENOTSUP;
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs = bs,
            .qiov = qiov,
            .pos = pos,
            .is_read = is_read,
            .ret = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        while (data.ret == -EINPROGRESS) {
            aio_poll(bdrv_get_aio_context(bs), true);
        }
        return data.ret;
    }
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}
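/*
 * Illustrative sketch: round-tripping a small blob of VM state through the
 * byte-buffer wrappers above.  The buffer and the offset are made up; only
 * the bdrv_save_vmstate()/bdrv_load_vmstate() signatures come from this file.
 *
 *     uint8_t buf[512];
 *     int ret;
 *
 *     memset(buf, 0xaa, sizeof(buf));
 *     ret = bdrv_save_vmstate(bs, buf, 0, sizeof(buf));  // size or -errno
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ret = bdrv_load_vmstate(bs, buf, 0, sizeof(buf));
 *     if (ret < 0) {
 *         return ret;
 *     }
 */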
/**************************************************************/
/* async I/Os */

BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);

    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
                                  0, cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);

    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
                                  0, cb, opaque, true);
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockRequest {
    union {
        /* Used during read, write, trim */
        struct {
            int64_t offset;
            int bytes;
            int flags;
            QEMUIOVector *qiov;
        };
        /* Used during ioctl */
        struct {
            int req;
            void *buf;
        };
    };
    BlockCompletionFunc *cb;
    void *opaque;

    int error;
} BlockRequest;

typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BdrvChild *child;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        bdrv_dec_in_flight(acb->common.bs);
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    }
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}
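/*
 * Illustrative sketch: submitting an asynchronous sector read through
 * bdrv_aio_readv() above.  The callback and the 'result' variable are
 * hypothetical; note that the qiov size must match nb_sectors, as the
 * assertion in bdrv_aio_readv() requires.
 *
 *     static void example_read_cb(void *opaque, int ret)
 *     {
 *         *(int *)opaque = ret;            // 0 on success, -errno on failure
 *     }
 *
 *     ...
 *     QEMUIOVector qiov;
 *     struct iovec iov = { .iov_base = buf, .iov_len = BDRV_SECTOR_SIZE };
 *     int result = -EINPROGRESS;
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     bdrv_aio_readv(child, 0, &qiov, 1, example_read_cb, &result);
 *     BDRV_POLL_WHILE(child->bs, result == -EINPROGRESS);
 */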
static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
                                          int64_t offset,
                                          QEMUIOVector *qiov,
                                          BdrvRequestFlags flags,
                                          BlockCompletionFunc *cb,
                                          void *opaque,
                                          bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
    bdrv_inc_in_flight(child->bs);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
    acb->child = child;
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.offset = offset;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw, acb);
    bdrv_coroutine_enter(child->bs, co);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
    bdrv_inc_in_flight(bs);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb);
    bdrv_coroutine_enter(bs, co);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
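/*
 * Illustrative sketch: starting a flush asynchronously and cancelling it.
 * 'example_flush_cb' and 'result' are hypothetical.  bdrv_aio_cancel()
 * blocks until the completion callback has run, while
 * bdrv_aio_cancel_async() only requests cancellation and returns.
 *
 *     int result = -EINPROGRESS;
 *     BlockAIOCB *acb = bdrv_aio_flush(bs, example_flush_cb, &result);
 *
 *     bdrv_aio_cancel(acb);
 *     // example_flush_cb() has been invoked by now, either with the real
 *     // completion status or with the outcome of the cancellation
 */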
/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    current_gen = bs->write_gen;

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, NULL);
    }

    bs->active_flush_req = true;

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush.  Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk.  Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}
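/*
 * Illustrative sketch: forcing previously written data to stable storage
 * with the synchronous wrapper above.  The write call, offset and length are
 * made up for the example; with BDRV_O_NO_FLUSH (cache=unsafe) bdrv_flush()
 * still succeeds but skips the flush-to-disk step, as bdrv_co_flush() shows.
 *
 *     ret = bdrv_pwrite(bs->file, offset, buf, len);
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ret = bdrv_flush(bs);
 *     if (ret < 0) {
 *         return ret;                      // data may not be on disk yet
 *     }
 */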
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int count;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
}

int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int count)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, count);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.
     */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + count) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (count > 0) {
        int ret;
        int num = count;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(count, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        count -= num;
    }
    ret = 0;
out:
    ++bs->write_gen;
    bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
                   req.bytes >> BDRV_SECTOR_BITS);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .count = count,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}
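/*
 * Illustrative sketch: discarding a byte range with the synchronous wrapper
 * above.  The offsets are made up.  Discard is advisory: a return value of 0
 * does not guarantee that the range now reads as zeroes, only that the
 * blocks may have been released (drivers answering -ENOTSUP for unaligned
 * fragments are silently tolerated by bdrv_co_pdiscard()).
 *
 *     ret = bdrv_pdiscard(bs, 1024 * 1024, 64 * 1024);
 *     if (ret < 0) {
 *         return ret;
 *     }
 */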
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (bs->io_plugged++ == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (--bs->io_plugged == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}
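/*
 * Illustrative sketch: allocating a bounce buffer that satisfies the memory
 * alignment advertised in bs->bl, and checking whether an existing vector is
 * already aligned.  The control flow is hypothetical; qemu_vfree() is the
 * matching deallocator for qemu_blockalign()-style allocations.
 *
 *     if (!bdrv_qiov_is_aligned(bs, qiov)) {
 *         void *bounce = qemu_try_blockalign(bs, qiov->size);
 *         if (bounce == NULL) {
 *             return -ENOMEM;
 *         }
 *         // ... perform the I/O through the bounce buffer instead ...
 *         qemu_vfree(bounce);
 *     }
 */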