1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "trace.h" 27 #include "sysemu/block-backend.h" 28 #include "block/aio-wait.h" 29 #include "block/blockjob.h" 30 #include "block/blockjob_int.h" 31 #include "block/block_int.h" 32 #include "block/coroutines.h" 33 #include "block/write-threshold.h" 34 #include "qemu/cutils.h" 35 #include "qemu/memalign.h" 36 #include "qapi/error.h" 37 #include "qemu/error-report.h" 38 #include "qemu/main-loop.h" 39 #include "sysemu/replay.h" 40 41 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ 42 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) 43 44 static void bdrv_parent_cb_resize(BlockDriverState *bs); 45 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 46 int64_t offset, int64_t bytes, BdrvRequestFlags flags); 47 48 static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore) 49 { 50 BdrvChild *c, *next; 51 52 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { 53 if (c == ignore) { 54 continue; 55 } 56 bdrv_parent_drained_begin_single(c); 57 } 58 } 59 60 void bdrv_parent_drained_end_single(BdrvChild *c) 61 { 62 IO_OR_GS_CODE(); 63 64 assert(c->quiesced_parent); 65 c->quiesced_parent = false; 66 67 if (c->klass->drained_end) { 68 c->klass->drained_end(c); 69 } 70 } 71 72 static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore) 73 { 74 BdrvChild *c; 75 76 QLIST_FOREACH(c, &bs->parents, next_parent) { 77 if (c == ignore) { 78 continue; 79 } 80 bdrv_parent_drained_end_single(c); 81 } 82 } 83 84 bool bdrv_parent_drained_poll_single(BdrvChild *c) 85 { 86 if (c->klass->drained_poll) { 87 return c->klass->drained_poll(c); 88 } 89 return false; 90 } 91 92 static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, 93 bool ignore_bds_parents) 94 { 95 BdrvChild *c, *next; 96 bool busy = false; 97 98 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { 99 if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) { 100 continue; 101 } 102 busy |= bdrv_parent_drained_poll_single(c); 103 } 104 105 return busy; 106 } 107 108 void bdrv_parent_drained_begin_single(BdrvChild *c) 109 { 110 IO_OR_GS_CODE(); 111 112 assert(!c->quiesced_parent); 113 c->quiesced_parent = true; 114 115 if (c->klass->drained_begin) { 116 c->klass->drained_begin(c); 117 } 118 } 119 120 static void 
bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) 121 { 122 dst->pdiscard_alignment = MAX(dst->pdiscard_alignment, 123 src->pdiscard_alignment); 124 dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); 125 dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); 126 dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer, 127 src->max_hw_transfer); 128 dst->opt_mem_alignment = MAX(dst->opt_mem_alignment, 129 src->opt_mem_alignment); 130 dst->min_mem_alignment = MAX(dst->min_mem_alignment, 131 src->min_mem_alignment); 132 dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov); 133 dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov); 134 } 135 136 typedef struct BdrvRefreshLimitsState { 137 BlockDriverState *bs; 138 BlockLimits old_bl; 139 } BdrvRefreshLimitsState; 140 141 static void bdrv_refresh_limits_abort(void *opaque) 142 { 143 BdrvRefreshLimitsState *s = opaque; 144 145 s->bs->bl = s->old_bl; 146 } 147 148 static TransactionActionDrv bdrv_refresh_limits_drv = { 149 .abort = bdrv_refresh_limits_abort, 150 .clean = g_free, 151 }; 152 153 /* @tran is allowed to be NULL, in this case no rollback is possible. */ 154 void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp) 155 { 156 ERRP_GUARD(); 157 BlockDriver *drv = bs->drv; 158 BdrvChild *c; 159 bool have_limits; 160 161 GLOBAL_STATE_CODE(); 162 163 if (tran) { 164 BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1); 165 *s = (BdrvRefreshLimitsState) { 166 .bs = bs, 167 .old_bl = bs->bl, 168 }; 169 tran_add(tran, &bdrv_refresh_limits_drv, s); 170 } 171 172 memset(&bs->bl, 0, sizeof(bs->bl)); 173 174 if (!drv) { 175 return; 176 } 177 178 /* Default alignment based on whether driver has byte interface */ 179 bs->bl.request_alignment = (drv->bdrv_co_preadv || 180 drv->bdrv_aio_preadv || 181 drv->bdrv_co_preadv_part) ? 1 : 512; 182 183 /* Take some limits from the children as a default */ 184 have_limits = false; 185 QLIST_FOREACH(c, &bs->children, next) { 186 if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW)) 187 { 188 bdrv_merge_limits(&bs->bl, &c->bs->bl); 189 have_limits = true; 190 } 191 } 192 193 if (!have_limits) { 194 bs->bl.min_mem_alignment = 512; 195 bs->bl.opt_mem_alignment = qemu_real_host_page_size(); 196 197 /* Safe default since most protocols use readv()/writev()/etc */ 198 bs->bl.max_iov = IOV_MAX; 199 } 200 201 /* Then let the driver override it */ 202 if (drv->bdrv_refresh_limits) { 203 drv->bdrv_refresh_limits(bs, errp); 204 if (*errp) { 205 return; 206 } 207 } 208 209 if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) { 210 error_setg(errp, "Driver requires too large request alignment"); 211 } 212 } 213 214 /** 215 * The copy-on-read flag is actually a reference count so multiple users may 216 * use the feature without worrying about clobbering its previous state. 217 * Copy-on-read stays enabled until all users have called to disable it. 
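* For example, two independent users may each call bdrv_enable_copy_on_read(); copy-on-read then stays active until both have called bdrv_disable_copy_on_read(), since the flag is a counter updated with qatomic_inc()/qatomic_fetch_dec() below.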
218 */ 219 void bdrv_enable_copy_on_read(BlockDriverState *bs) 220 { 221 IO_CODE(); 222 qatomic_inc(&bs->copy_on_read); 223 } 224 225 void bdrv_disable_copy_on_read(BlockDriverState *bs) 226 { 227 int old = qatomic_fetch_dec(&bs->copy_on_read); 228 IO_CODE(); 229 assert(old >= 1); 230 } 231 232 typedef struct { 233 Coroutine *co; 234 BlockDriverState *bs; 235 bool done; 236 bool begin; 237 bool poll; 238 BdrvChild *parent; 239 } BdrvCoDrainData; 240 241 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */ 242 bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent, 243 bool ignore_bds_parents) 244 { 245 IO_OR_GS_CODE(); 246 247 if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) { 248 return true; 249 } 250 251 if (qatomic_read(&bs->in_flight)) { 252 return true; 253 } 254 255 return false; 256 } 257 258 static bool bdrv_drain_poll_top_level(BlockDriverState *bs, 259 BdrvChild *ignore_parent) 260 { 261 return bdrv_drain_poll(bs, ignore_parent, false); 262 } 263 264 static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent, 265 bool poll); 266 static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent); 267 268 static void bdrv_co_drain_bh_cb(void *opaque) 269 { 270 BdrvCoDrainData *data = opaque; 271 Coroutine *co = data->co; 272 BlockDriverState *bs = data->bs; 273 274 if (bs) { 275 AioContext *ctx = bdrv_get_aio_context(bs); 276 aio_context_acquire(ctx); 277 bdrv_dec_in_flight(bs); 278 if (data->begin) { 279 bdrv_do_drained_begin(bs, data->parent, data->poll); 280 } else { 281 assert(!data->poll); 282 bdrv_do_drained_end(bs, data->parent); 283 } 284 aio_context_release(ctx); 285 } else { 286 assert(data->begin); 287 bdrv_drain_all_begin(); 288 } 289 290 data->done = true; 291 aio_co_wake(co); 292 } 293 294 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, 295 bool begin, 296 BdrvChild *parent, 297 bool poll) 298 { 299 BdrvCoDrainData data; 300 Coroutine *self = qemu_coroutine_self(); 301 AioContext *ctx = bdrv_get_aio_context(bs); 302 AioContext *co_ctx = qemu_coroutine_get_aio_context(self); 303 304 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and 305 * other coroutines run if they were queued by aio_co_enter(). */ 306 307 assert(qemu_in_coroutine()); 308 data = (BdrvCoDrainData) { 309 .co = self, 310 .bs = bs, 311 .done = false, 312 .begin = begin, 313 .parent = parent, 314 .poll = poll, 315 }; 316 317 if (bs) { 318 bdrv_inc_in_flight(bs); 319 } 320 321 /* 322 * Temporarily drop the lock across yield or we would get deadlocks. 323 * bdrv_co_drain_bh_cb() reaquires the lock as needed. 324 * 325 * When we yield below, the lock for the current context will be 326 * released, so if this is actually the lock that protects bs, don't drop 327 * it a second time. 328 */ 329 if (ctx != co_ctx) { 330 aio_context_release(ctx); 331 } 332 replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data); 333 334 qemu_coroutine_yield(); 335 /* If we are resumed from some other event (such as an aio completion or a 336 * timer callback), it is a bug in the caller that should be fixed. 
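* (bdrv_co_drain_bh_cb(), scheduled above via replay_bh_schedule_oneshot_event(), is the only place expected to wake this coroutine; it does so with aio_co_wake() after setting data.done.)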
*/ 337 assert(data.done); 338 339 /* Reaquire the AioContext of bs if we dropped it */ 340 if (ctx != co_ctx) { 341 aio_context_acquire(ctx); 342 } 343 } 344 345 static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent, 346 bool poll) 347 { 348 IO_OR_GS_CODE(); 349 350 if (qemu_in_coroutine()) { 351 bdrv_co_yield_to_drain(bs, true, parent, poll); 352 return; 353 } 354 355 /* Stop things in parent-to-child order */ 356 if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) { 357 aio_disable_external(bdrv_get_aio_context(bs)); 358 bdrv_parent_drained_begin(bs, parent); 359 if (bs->drv && bs->drv->bdrv_drain_begin) { 360 bs->drv->bdrv_drain_begin(bs); 361 } 362 } 363 364 /* 365 * Wait for drained requests to finish. 366 * 367 * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The 368 * call is needed so things in this AioContext can make progress even 369 * though we don't return to the main AioContext loop - this automatically 370 * includes other nodes in the same AioContext and therefore all child 371 * nodes. 372 */ 373 if (poll) { 374 BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent)); 375 } 376 } 377 378 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent) 379 { 380 bdrv_do_drained_begin(bs, parent, false); 381 } 382 383 void bdrv_drained_begin(BlockDriverState *bs) 384 { 385 IO_OR_GS_CODE(); 386 bdrv_do_drained_begin(bs, NULL, true); 387 } 388 389 /** 390 * This function does not poll, nor must any of its recursively called 391 * functions. 392 */ 393 static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent) 394 { 395 int old_quiesce_counter; 396 397 if (qemu_in_coroutine()) { 398 bdrv_co_yield_to_drain(bs, false, parent, false); 399 return; 400 } 401 assert(bs->quiesce_counter > 0); 402 403 /* Re-enable things in child-to-parent order */ 404 old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter); 405 if (old_quiesce_counter == 1) { 406 if (bs->drv && bs->drv->bdrv_drain_end) { 407 bs->drv->bdrv_drain_end(bs); 408 } 409 bdrv_parent_drained_end(bs, parent); 410 aio_enable_external(bdrv_get_aio_context(bs)); 411 } 412 } 413 414 void bdrv_drained_end(BlockDriverState *bs) 415 { 416 IO_OR_GS_CODE(); 417 bdrv_do_drained_end(bs, NULL); 418 } 419 420 void bdrv_drain(BlockDriverState *bs) 421 { 422 IO_OR_GS_CODE(); 423 bdrv_drained_begin(bs); 424 bdrv_drained_end(bs); 425 } 426 427 static void bdrv_drain_assert_idle(BlockDriverState *bs) 428 { 429 BdrvChild *child, *next; 430 431 assert(qatomic_read(&bs->in_flight) == 0); 432 QLIST_FOREACH_SAFE(child, &bs->children, next, next) { 433 bdrv_drain_assert_idle(child->bs); 434 } 435 } 436 437 unsigned int bdrv_drain_all_count = 0; 438 439 static bool bdrv_drain_all_poll(void) 440 { 441 BlockDriverState *bs = NULL; 442 bool result = false; 443 GLOBAL_STATE_CODE(); 444 445 /* bdrv_drain_poll() can't make changes to the graph and we are holding the 446 * main AioContext lock, so iterating bdrv_next_all_states() is safe. */ 447 while ((bs = bdrv_next_all_states(bs))) { 448 AioContext *aio_context = bdrv_get_aio_context(bs); 449 aio_context_acquire(aio_context); 450 result |= bdrv_drain_poll(bs, NULL, true); 451 aio_context_release(aio_context); 452 } 453 454 return result; 455 } 456 457 /* 458 * Wait for pending requests to complete across all BlockDriverStates 459 * 460 * This function does not flush data to disk, use bdrv_flush_all() for that 461 * after calling this function. 462 * 463 * This pauses all block jobs and disables external clients. 
It must 464 * be paired with bdrv_drain_all_end(). 465 * 466 * NOTE: no new block jobs or BlockDriverStates can be created between 467 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls. 468 */ 469 void bdrv_drain_all_begin_nopoll(void) 470 { 471 BlockDriverState *bs = NULL; 472 GLOBAL_STATE_CODE(); 473 474 /* 475 * bdrv queue is managed by record/replay, 476 * waiting for finishing the I/O requests may 477 * be infinite 478 */ 479 if (replay_events_enabled()) { 480 return; 481 } 482 483 /* AIO_WAIT_WHILE() with a NULL context can only be called from the main 484 * loop AioContext, so make sure we're in the main context. */ 485 assert(qemu_get_current_aio_context() == qemu_get_aio_context()); 486 assert(bdrv_drain_all_count < INT_MAX); 487 bdrv_drain_all_count++; 488 489 /* Quiesce all nodes, without polling in-flight requests yet. The graph 490 * cannot change during this loop. */ 491 while ((bs = bdrv_next_all_states(bs))) { 492 AioContext *aio_context = bdrv_get_aio_context(bs); 493 494 aio_context_acquire(aio_context); 495 bdrv_do_drained_begin(bs, NULL, false); 496 aio_context_release(aio_context); 497 } 498 } 499 500 void bdrv_drain_all_begin(void) 501 { 502 BlockDriverState *bs = NULL; 503 504 if (qemu_in_coroutine()) { 505 bdrv_co_yield_to_drain(NULL, true, NULL, true); 506 return; 507 } 508 509 bdrv_drain_all_begin_nopoll(); 510 511 /* Now poll the in-flight requests */ 512 AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll()); 513 514 while ((bs = bdrv_next_all_states(bs))) { 515 bdrv_drain_assert_idle(bs); 516 } 517 } 518 519 void bdrv_drain_all_end_quiesce(BlockDriverState *bs) 520 { 521 GLOBAL_STATE_CODE(); 522 523 g_assert(bs->quiesce_counter > 0); 524 g_assert(!bs->refcnt); 525 526 while (bs->quiesce_counter) { 527 bdrv_do_drained_end(bs, NULL); 528 } 529 } 530 531 void bdrv_drain_all_end(void) 532 { 533 BlockDriverState *bs = NULL; 534 GLOBAL_STATE_CODE(); 535 536 /* 537 * bdrv queue is managed by record/replay, 538 * waiting for finishing the I/O requests may 539 * be endless 540 */ 541 if (replay_events_enabled()) { 542 return; 543 } 544 545 while ((bs = bdrv_next_all_states(bs))) { 546 AioContext *aio_context = bdrv_get_aio_context(bs); 547 548 aio_context_acquire(aio_context); 549 bdrv_do_drained_end(bs, NULL); 550 aio_context_release(aio_context); 551 } 552 553 assert(qemu_get_current_aio_context() == qemu_get_aio_context()); 554 assert(bdrv_drain_all_count > 0); 555 bdrv_drain_all_count--; 556 } 557 558 void bdrv_drain_all(void) 559 { 560 GLOBAL_STATE_CODE(); 561 bdrv_drain_all_begin(); 562 bdrv_drain_all_end(); 563 } 564 565 /** 566 * Remove an active request from the tracked requests list 567 * 568 * This function should be called when a tracked request is completing. 
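* A typical pairing, as in bdrv_co_preadv_part() below: tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); ret = bdrv_aligned_preadv(...); tracked_request_end(&req);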
569 */ 570 static void coroutine_fn tracked_request_end(BdrvTrackedRequest *req) 571 { 572 if (req->serialising) { 573 qatomic_dec(&req->bs->serialising_in_flight); 574 } 575 576 qemu_co_mutex_lock(&req->bs->reqs_lock); 577 QLIST_REMOVE(req, list); 578 qemu_co_queue_restart_all(&req->wait_queue); 579 qemu_co_mutex_unlock(&req->bs->reqs_lock); 580 } 581 582 /** 583 * Add an active request to the tracked requests list 584 */ 585 static void coroutine_fn tracked_request_begin(BdrvTrackedRequest *req, 586 BlockDriverState *bs, 587 int64_t offset, 588 int64_t bytes, 589 enum BdrvTrackedRequestType type) 590 { 591 bdrv_check_request(offset, bytes, &error_abort); 592 593 *req = (BdrvTrackedRequest){ 594 .bs = bs, 595 .offset = offset, 596 .bytes = bytes, 597 .type = type, 598 .co = qemu_coroutine_self(), 599 .serialising = false, 600 .overlap_offset = offset, 601 .overlap_bytes = bytes, 602 }; 603 604 qemu_co_queue_init(&req->wait_queue); 605 606 qemu_co_mutex_lock(&bs->reqs_lock); 607 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 608 qemu_co_mutex_unlock(&bs->reqs_lock); 609 } 610 611 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 612 int64_t offset, int64_t bytes) 613 { 614 bdrv_check_request(offset, bytes, &error_abort); 615 616 /* aaaa bbbb */ 617 if (offset >= req->overlap_offset + req->overlap_bytes) { 618 return false; 619 } 620 /* bbbb aaaa */ 621 if (req->overlap_offset >= offset + bytes) { 622 return false; 623 } 624 return true; 625 } 626 627 /* Called with self->bs->reqs_lock held */ 628 static coroutine_fn BdrvTrackedRequest * 629 bdrv_find_conflicting_request(BdrvTrackedRequest *self) 630 { 631 BdrvTrackedRequest *req; 632 633 QLIST_FOREACH(req, &self->bs->tracked_requests, list) { 634 if (req == self || (!req->serialising && !self->serialising)) { 635 continue; 636 } 637 if (tracked_request_overlaps(req, self->overlap_offset, 638 self->overlap_bytes)) 639 { 640 /* 641 * Hitting this means there was a reentrant request, for 642 * example, a block driver issuing nested requests. This must 643 * never happen since it means deadlock. 644 */ 645 assert(qemu_coroutine_self() != req->co); 646 647 /* 648 * If the request is already (indirectly) waiting for us, or 649 * will wait for us as soon as it wakes up, then just go on 650 * (instead of producing a deadlock in the former case). 
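* (A request whose waiting_for is set is itself blocked in bdrv_wait_serialising_requests_locked() and will re-scan for conflicting requests, including this one, once it is woken.)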
651 */ 652 if (!req->waiting_for) { 653 return req; 654 } 655 } 656 } 657 658 return NULL; 659 } 660 661 /* Called with self->bs->reqs_lock held */ 662 static void coroutine_fn 663 bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self) 664 { 665 BdrvTrackedRequest *req; 666 667 while ((req = bdrv_find_conflicting_request(self))) { 668 self->waiting_for = req; 669 qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock); 670 self->waiting_for = NULL; 671 } 672 } 673 674 /* Called with req->bs->reqs_lock held */ 675 static void tracked_request_set_serialising(BdrvTrackedRequest *req, 676 uint64_t align) 677 { 678 int64_t overlap_offset = req->offset & ~(align - 1); 679 int64_t overlap_bytes = 680 ROUND_UP(req->offset + req->bytes, align) - overlap_offset; 681 682 bdrv_check_request(req->offset, req->bytes, &error_abort); 683 684 if (!req->serialising) { 685 qatomic_inc(&req->bs->serialising_in_flight); 686 req->serialising = true; 687 } 688 689 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 690 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 691 } 692 693 /** 694 * Return the tracked request on @bs for the current coroutine, or 695 * NULL if there is none. 696 */ 697 BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs) 698 { 699 BdrvTrackedRequest *req; 700 Coroutine *self = qemu_coroutine_self(); 701 IO_CODE(); 702 703 QLIST_FOREACH(req, &bs->tracked_requests, list) { 704 if (req->co == self) { 705 return req; 706 } 707 } 708 709 return NULL; 710 } 711 712 /** 713 * Round a region to cluster boundaries 714 */ 715 void bdrv_round_to_clusters(BlockDriverState *bs, 716 int64_t offset, int64_t bytes, 717 int64_t *cluster_offset, 718 int64_t *cluster_bytes) 719 { 720 BlockDriverInfo bdi; 721 IO_CODE(); 722 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 723 *cluster_offset = offset; 724 *cluster_bytes = bytes; 725 } else { 726 int64_t c = bdi.cluster_size; 727 *cluster_offset = QEMU_ALIGN_DOWN(offset, c); 728 *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); 729 } 730 } 731 732 static int bdrv_get_cluster_size(BlockDriverState *bs) 733 { 734 BlockDriverInfo bdi; 735 int ret; 736 737 ret = bdrv_get_info(bs, &bdi); 738 if (ret < 0 || bdi.cluster_size == 0) { 739 return bs->bl.request_alignment; 740 } else { 741 return bdi.cluster_size; 742 } 743 } 744 745 void bdrv_inc_in_flight(BlockDriverState *bs) 746 { 747 IO_CODE(); 748 qatomic_inc(&bs->in_flight); 749 } 750 751 void bdrv_wakeup(BlockDriverState *bs) 752 { 753 IO_CODE(); 754 aio_wait_kick(); 755 } 756 757 void bdrv_dec_in_flight(BlockDriverState *bs) 758 { 759 IO_CODE(); 760 qatomic_dec(&bs->in_flight); 761 bdrv_wakeup(bs); 762 } 763 764 static void coroutine_fn 765 bdrv_wait_serialising_requests(BdrvTrackedRequest *self) 766 { 767 BlockDriverState *bs = self->bs; 768 769 if (!qatomic_read(&bs->serialising_in_flight)) { 770 return; 771 } 772 773 qemu_co_mutex_lock(&bs->reqs_lock); 774 bdrv_wait_serialising_requests_locked(self); 775 qemu_co_mutex_unlock(&bs->reqs_lock); 776 } 777 778 void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req, 779 uint64_t align) 780 { 781 IO_CODE(); 782 783 qemu_co_mutex_lock(&req->bs->reqs_lock); 784 785 tracked_request_set_serialising(req, align); 786 bdrv_wait_serialising_requests_locked(req); 787 788 qemu_co_mutex_unlock(&req->bs->reqs_lock); 789 } 790 791 int bdrv_check_qiov_request(int64_t offset, int64_t bytes, 792 QEMUIOVector *qiov, size_t qiov_offset, 793 Error **errp) 794 { 795 /* 796 * 
Check generic offset/bytes correctness 797 */ 798 799 if (offset < 0) { 800 error_setg(errp, "offset is negative: %" PRIi64, offset); 801 return -EIO; 802 } 803 804 if (bytes < 0) { 805 error_setg(errp, "bytes is negative: %" PRIi64, bytes); 806 return -EIO; 807 } 808 809 if (bytes > BDRV_MAX_LENGTH) { 810 error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")", 811 bytes, BDRV_MAX_LENGTH); 812 return -EIO; 813 } 814 815 if (offset > BDRV_MAX_LENGTH) { 816 error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")", 817 offset, BDRV_MAX_LENGTH); 818 return -EIO; 819 } 820 821 if (offset > BDRV_MAX_LENGTH - bytes) { 822 error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") " 823 "exceeds maximum(%" PRIi64 ")", offset, bytes, 824 BDRV_MAX_LENGTH); 825 return -EIO; 826 } 827 828 if (!qiov) { 829 return 0; 830 } 831 832 /* 833 * Check qiov and qiov_offset 834 */ 835 836 if (qiov_offset > qiov->size) { 837 error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)", 838 qiov_offset, qiov->size); 839 return -EIO; 840 } 841 842 if (bytes > qiov->size - qiov_offset) { 843 error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io " 844 "vector size(%zu)", bytes, qiov_offset, qiov->size); 845 return -EIO; 846 } 847 848 return 0; 849 } 850 851 int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp) 852 { 853 return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp); 854 } 855 856 static int bdrv_check_request32(int64_t offset, int64_t bytes, 857 QEMUIOVector *qiov, size_t qiov_offset) 858 { 859 int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL); 860 if (ret < 0) { 861 return ret; 862 } 863 864 if (bytes > BDRV_REQUEST_MAX_BYTES) { 865 return -EIO; 866 } 867 868 return 0; 869 } 870 871 /* 872 * Completely zero out a block device with the help of bdrv_pwrite_zeroes. 873 * The operation is sped up by checking the block status and only writing 874 * zeroes to the device if they currently do not return zeroes. Optional 875 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, 876 * BDRV_REQ_FUA). 877 * 878 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite(). 879 */ 880 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) 881 { 882 int ret; 883 int64_t target_size, bytes, offset = 0; 884 BlockDriverState *bs = child->bs; 885 IO_CODE(); 886 887 target_size = bdrv_getlength(bs); 888 if (target_size < 0) { 889 return target_size; 890 } 891 892 for (;;) { 893 bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES); 894 if (bytes <= 0) { 895 return 0; 896 } 897 ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL); 898 if (ret < 0) { 899 return ret; 900 } 901 if (ret & BDRV_BLOCK_ZERO) { 902 offset += bytes; 903 continue; 904 } 905 ret = bdrv_pwrite_zeroes(child, offset, bytes, flags); 906 if (ret < 0) { 907 return ret; 908 } 909 offset += bytes; 910 } 911 } 912 913 /* 914 * Writes to the file and ensures that no writes are reordered across this 915 * request (acts as a barrier) 916 * 917 * Returns 0 on success, -errno in error cases. 
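* Implemented below as a bdrv_co_pwrite() followed by a bdrv_co_flush() on the same node.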
918 */ 919 int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset, 920 int64_t bytes, const void *buf, 921 BdrvRequestFlags flags) 922 { 923 int ret; 924 IO_CODE(); 925 926 ret = bdrv_co_pwrite(child, offset, bytes, buf, flags); 927 if (ret < 0) { 928 return ret; 929 } 930 931 ret = bdrv_co_flush(child->bs); 932 if (ret < 0) { 933 return ret; 934 } 935 936 return 0; 937 } 938 939 typedef struct CoroutineIOCompletion { 940 Coroutine *coroutine; 941 int ret; 942 } CoroutineIOCompletion; 943 944 static void bdrv_co_io_em_complete(void *opaque, int ret) 945 { 946 CoroutineIOCompletion *co = opaque; 947 948 co->ret = ret; 949 aio_co_wake(co->coroutine); 950 } 951 952 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, 953 int64_t offset, int64_t bytes, 954 QEMUIOVector *qiov, 955 size_t qiov_offset, int flags) 956 { 957 BlockDriver *drv = bs->drv; 958 int64_t sector_num; 959 unsigned int nb_sectors; 960 QEMUIOVector local_qiov; 961 int ret; 962 963 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 964 assert(!(flags & ~bs->supported_read_flags)); 965 966 if (!drv) { 967 return -ENOMEDIUM; 968 } 969 970 if (drv->bdrv_co_preadv_part) { 971 return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset, 972 flags); 973 } 974 975 if (qiov_offset > 0 || bytes != qiov->size) { 976 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 977 qiov = &local_qiov; 978 } 979 980 if (drv->bdrv_co_preadv) { 981 ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); 982 goto out; 983 } 984 985 if (drv->bdrv_aio_preadv) { 986 BlockAIOCB *acb; 987 CoroutineIOCompletion co = { 988 .coroutine = qemu_coroutine_self(), 989 }; 990 991 acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags, 992 bdrv_co_io_em_complete, &co); 993 if (acb == NULL) { 994 ret = -EIO; 995 goto out; 996 } else { 997 qemu_coroutine_yield(); 998 ret = co.ret; 999 goto out; 1000 } 1001 } 1002 1003 sector_num = offset >> BDRV_SECTOR_BITS; 1004 nb_sectors = bytes >> BDRV_SECTOR_BITS; 1005 1006 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 1007 assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 1008 assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1009 assert(drv->bdrv_co_readv); 1010 1011 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 1012 1013 out: 1014 if (qiov == &local_qiov) { 1015 qemu_iovec_destroy(&local_qiov); 1016 } 1017 1018 return ret; 1019 } 1020 1021 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, 1022 int64_t offset, int64_t bytes, 1023 QEMUIOVector *qiov, 1024 size_t qiov_offset, 1025 BdrvRequestFlags flags) 1026 { 1027 BlockDriver *drv = bs->drv; 1028 bool emulate_fua = false; 1029 int64_t sector_num; 1030 unsigned int nb_sectors; 1031 QEMUIOVector local_qiov; 1032 int ret; 1033 1034 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1035 1036 if (!drv) { 1037 return -ENOMEDIUM; 1038 } 1039 1040 if ((flags & BDRV_REQ_FUA) && 1041 (~bs->supported_write_flags & BDRV_REQ_FUA)) { 1042 flags &= ~BDRV_REQ_FUA; 1043 emulate_fua = true; 1044 } 1045 1046 flags &= bs->supported_write_flags; 1047 1048 if (drv->bdrv_co_pwritev_part) { 1049 ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 1050 flags); 1051 goto emulate_flags; 1052 } 1053 1054 if (qiov_offset > 0 || bytes != qiov->size) { 1055 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1056 qiov = &local_qiov; 1057 } 1058 1059 if (drv->bdrv_co_pwritev) { 1060 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags); 1061 goto 
emulate_flags; 1062 } 1063 1064 if (drv->bdrv_aio_pwritev) { 1065 BlockAIOCB *acb; 1066 CoroutineIOCompletion co = { 1067 .coroutine = qemu_coroutine_self(), 1068 }; 1069 1070 acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags, 1071 bdrv_co_io_em_complete, &co); 1072 if (acb == NULL) { 1073 ret = -EIO; 1074 } else { 1075 qemu_coroutine_yield(); 1076 ret = co.ret; 1077 } 1078 goto emulate_flags; 1079 } 1080 1081 sector_num = offset >> BDRV_SECTOR_BITS; 1082 nb_sectors = bytes >> BDRV_SECTOR_BITS; 1083 1084 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 1085 assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 1086 assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1087 1088 assert(drv->bdrv_co_writev); 1089 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags); 1090 1091 emulate_flags: 1092 if (ret == 0 && emulate_fua) { 1093 ret = bdrv_co_flush(bs); 1094 } 1095 1096 if (qiov == &local_qiov) { 1097 qemu_iovec_destroy(&local_qiov); 1098 } 1099 1100 return ret; 1101 } 1102 1103 static int coroutine_fn 1104 bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset, 1105 int64_t bytes, QEMUIOVector *qiov, 1106 size_t qiov_offset) 1107 { 1108 BlockDriver *drv = bs->drv; 1109 QEMUIOVector local_qiov; 1110 int ret; 1111 1112 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1113 1114 if (!drv) { 1115 return -ENOMEDIUM; 1116 } 1117 1118 if (!block_driver_can_compress(drv)) { 1119 return -ENOTSUP; 1120 } 1121 1122 if (drv->bdrv_co_pwritev_compressed_part) { 1123 return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes, 1124 qiov, qiov_offset); 1125 } 1126 1127 if (qiov_offset == 0) { 1128 return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov); 1129 } 1130 1131 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1132 ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov); 1133 qemu_iovec_destroy(&local_qiov); 1134 1135 return ret; 1136 } 1137 1138 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 1139 int64_t offset, int64_t bytes, QEMUIOVector *qiov, 1140 size_t qiov_offset, int flags) 1141 { 1142 BlockDriverState *bs = child->bs; 1143 1144 /* Perform I/O through a temporary buffer so that users who scribble over 1145 * their read buffer while the operation is in progress do not end up 1146 * modifying the image file. This is critical for zero-copy guest I/O 1147 * where anything might happen inside guest memory. 1148 */ 1149 void *bounce_buffer = NULL; 1150 1151 BlockDriver *drv = bs->drv; 1152 int64_t cluster_offset; 1153 int64_t cluster_bytes; 1154 int64_t skip_bytes; 1155 int ret; 1156 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1157 BDRV_REQUEST_MAX_BYTES); 1158 int64_t progress = 0; 1159 bool skip_write; 1160 1161 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1162 1163 if (!drv) { 1164 return -ENOMEDIUM; 1165 } 1166 1167 /* 1168 * Do not write anything when the BDS is inactive. That is not 1169 * allowed, and it would not help. 1170 */ 1171 skip_write = (bs->open_flags & BDRV_O_INACTIVE); 1172 1173 /* FIXME We cannot require callers to have write permissions when all they 1174 * are doing is a read request. If we did things right, write permissions 1175 * would be obtained anyway, but internally by the copy-on-read code. As 1176 * long as it is implemented here rather than in a separate filter driver, 1177 * the copy-on-read code doesn't have its own BdrvChild, however, for which 1178 * it could request permissions. 
Therefore we have to bypass the permission 1179 * system for the moment. */ 1180 // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1181 1182 /* Cover entire cluster so no additional backing file I/O is required when 1183 * allocating cluster in the image file. Note that this value may exceed 1184 * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which 1185 * is one reason we loop rather than doing it all at once. 1186 */ 1187 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1188 skip_bytes = offset - cluster_offset; 1189 1190 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1191 cluster_offset, cluster_bytes); 1192 1193 while (cluster_bytes) { 1194 int64_t pnum; 1195 1196 if (skip_write) { 1197 ret = 1; /* "already allocated", so nothing will be copied */ 1198 pnum = MIN(cluster_bytes, max_transfer); 1199 } else { 1200 ret = bdrv_is_allocated(bs, cluster_offset, 1201 MIN(cluster_bytes, max_transfer), &pnum); 1202 if (ret < 0) { 1203 /* 1204 * Safe to treat errors in querying allocation as if 1205 * unallocated; we'll probably fail again soon on the 1206 * read, but at least that will set a decent errno. 1207 */ 1208 pnum = MIN(cluster_bytes, max_transfer); 1209 } 1210 1211 /* Stop at EOF if the image ends in the middle of the cluster */ 1212 if (ret == 0 && pnum == 0) { 1213 assert(progress >= bytes); 1214 break; 1215 } 1216 1217 assert(skip_bytes < pnum); 1218 } 1219 1220 if (ret <= 0) { 1221 QEMUIOVector local_qiov; 1222 1223 /* Must copy-on-read; use the bounce buffer */ 1224 pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 1225 if (!bounce_buffer) { 1226 int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); 1227 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); 1228 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); 1229 1230 bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len); 1231 if (!bounce_buffer) { 1232 ret = -ENOMEM; 1233 goto err; 1234 } 1235 } 1236 qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); 1237 1238 ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1239 &local_qiov, 0, 0); 1240 if (ret < 0) { 1241 goto err; 1242 } 1243 1244 bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1245 if (drv->bdrv_co_pwrite_zeroes && 1246 buffer_is_zero(bounce_buffer, pnum)) { 1247 /* FIXME: Should we (perhaps conditionally) be setting 1248 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1249 * that still correctly reads as zero? */ 1250 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 1251 BDRV_REQ_WRITE_UNCHANGED); 1252 } else { 1253 /* This does not change the data on the disk, it is not 1254 * necessary to flush even in cache=writethrough mode. 1255 */ 1256 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 1257 &local_qiov, 0, 1258 BDRV_REQ_WRITE_UNCHANGED); 1259 } 1260 1261 if (ret < 0) { 1262 /* It might be okay to ignore write errors for guest 1263 * requests. If this is a deliberate copy-on-read 1264 * then we don't want to ignore the error. Simply 1265 * report it in all cases. 
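* (At this point the data has already been read into the bounce buffer; only the copy-out write to this node failed.)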
1266 */ 1267 goto err; 1268 } 1269 1270 if (!(flags & BDRV_REQ_PREFETCH)) { 1271 qemu_iovec_from_buf(qiov, qiov_offset + progress, 1272 bounce_buffer + skip_bytes, 1273 MIN(pnum - skip_bytes, bytes - progress)); 1274 } 1275 } else if (!(flags & BDRV_REQ_PREFETCH)) { 1276 /* Read directly into the destination */ 1277 ret = bdrv_driver_preadv(bs, offset + progress, 1278 MIN(pnum - skip_bytes, bytes - progress), 1279 qiov, qiov_offset + progress, 0); 1280 if (ret < 0) { 1281 goto err; 1282 } 1283 } 1284 1285 cluster_offset += pnum; 1286 cluster_bytes -= pnum; 1287 progress += pnum - skip_bytes; 1288 skip_bytes = 0; 1289 } 1290 ret = 0; 1291 1292 err: 1293 qemu_vfree(bounce_buffer); 1294 return ret; 1295 } 1296 1297 /* 1298 * Forwards an already correctly aligned request to the BlockDriver. This 1299 * handles copy on read, zeroing after EOF, and fragmentation of large 1300 * reads; any other features must be implemented by the caller. 1301 */ 1302 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 1303 BdrvTrackedRequest *req, int64_t offset, int64_t bytes, 1304 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 1305 { 1306 BlockDriverState *bs = child->bs; 1307 int64_t total_bytes, max_bytes; 1308 int ret = 0; 1309 int64_t bytes_remaining = bytes; 1310 int max_transfer; 1311 1312 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1313 assert(is_power_of_2(align)); 1314 assert((offset & (align - 1)) == 0); 1315 assert((bytes & (align - 1)) == 0); 1316 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1317 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1318 align); 1319 1320 /* 1321 * TODO: We would need a per-BDS .supported_read_flags and 1322 * potential fallback support, if we ever implement any read flags 1323 * to pass through to drivers. For now, there aren't any 1324 * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint. 1325 */ 1326 assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH | 1327 BDRV_REQ_REGISTERED_BUF))); 1328 1329 /* Handle Copy on Read and associated serialisation */ 1330 if (flags & BDRV_REQ_COPY_ON_READ) { 1331 /* If we touch the same cluster it counts as an overlap. This 1332 * guarantees that allocating writes will be serialized and not race 1333 * with each other for the same cluster. For example, in copy-on-read 1334 * it ensures that the CoR read and write operations are atomic and 1335 * guest writes cannot interleave between them. 
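* The serialising granularity used below is bdrv_get_cluster_size(bs), so a copy-on-read request and any write touching the same cluster wait for each other.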
*/ 1336 bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); 1337 } else { 1338 bdrv_wait_serialising_requests(req); 1339 } 1340 1341 if (flags & BDRV_REQ_COPY_ON_READ) { 1342 int64_t pnum; 1343 1344 /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */ 1345 flags &= ~BDRV_REQ_COPY_ON_READ; 1346 1347 ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 1348 if (ret < 0) { 1349 goto out; 1350 } 1351 1352 if (!ret || pnum != bytes) { 1353 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, 1354 qiov, qiov_offset, flags); 1355 goto out; 1356 } else if (flags & BDRV_REQ_PREFETCH) { 1357 goto out; 1358 } 1359 } 1360 1361 /* Forward the request to the BlockDriver, possibly fragmenting it */ 1362 total_bytes = bdrv_getlength(bs); 1363 if (total_bytes < 0) { 1364 ret = total_bytes; 1365 goto out; 1366 } 1367 1368 assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF))); 1369 1370 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 1371 if (bytes <= max_bytes && bytes <= max_transfer) { 1372 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags); 1373 goto out; 1374 } 1375 1376 while (bytes_remaining) { 1377 int64_t num; 1378 1379 if (max_bytes) { 1380 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 1381 assert(num); 1382 1383 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1384 num, qiov, 1385 qiov_offset + bytes - bytes_remaining, 1386 flags); 1387 max_bytes -= num; 1388 } else { 1389 num = bytes_remaining; 1390 ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining, 1391 0, bytes_remaining); 1392 } 1393 if (ret < 0) { 1394 goto out; 1395 } 1396 bytes_remaining -= num; 1397 } 1398 1399 out: 1400 return ret < 0 ? ret : 0; 1401 } 1402 1403 /* 1404 * Request padding 1405 * 1406 * |<---- align ----->| |<----- align ---->| 1407 * |<- head ->|<------------- bytes ------------->|<-- tail -->| 1408 * | | | | | | 1409 * -*----------$-------*-------- ... --------*-----$------------*--- 1410 * | | | | | | 1411 * | offset | | end | 1412 * ALIGN_DOWN(offset) ALIGN_UP(offset) ALIGN_DOWN(end) ALIGN_UP(end) 1413 * [buf ... ) [tail_buf ) 1414 * 1415 * @buf is an aligned allocation needed to store @head and @tail paddings. @head 1416 * is placed at the beginning of @buf and @tail at the @end. 1417 * 1418 * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk 1419 * around tail, if tail exists. 1420 * 1421 * @merge_reads is true for small requests, 1422 * if @buf_len == @head + bytes + @tail. In this case it is possible that both 1423 * head and tail exist but @buf_len == align and @tail_buf == @buf. 
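* For instance, with align == 512, offset == 700 and bytes == 100: head == 188 and tail == 224, so head + bytes + tail == 512 == align, @buf_len == align, @merge_reads is true and @tail_buf == @buf.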
1424 */ 1425 typedef struct BdrvRequestPadding { 1426 uint8_t *buf; 1427 size_t buf_len; 1428 uint8_t *tail_buf; 1429 size_t head; 1430 size_t tail; 1431 bool merge_reads; 1432 QEMUIOVector local_qiov; 1433 } BdrvRequestPadding; 1434 1435 static bool bdrv_init_padding(BlockDriverState *bs, 1436 int64_t offset, int64_t bytes, 1437 BdrvRequestPadding *pad) 1438 { 1439 int64_t align = bs->bl.request_alignment; 1440 int64_t sum; 1441 1442 bdrv_check_request(offset, bytes, &error_abort); 1443 assert(align <= INT_MAX); /* documented in block/block_int.h */ 1444 assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */ 1445 1446 memset(pad, 0, sizeof(*pad)); 1447 1448 pad->head = offset & (align - 1); 1449 pad->tail = ((offset + bytes) & (align - 1)); 1450 if (pad->tail) { 1451 pad->tail = align - pad->tail; 1452 } 1453 1454 if (!pad->head && !pad->tail) { 1455 return false; 1456 } 1457 1458 assert(bytes); /* Nothing good in aligning zero-length requests */ 1459 1460 sum = pad->head + bytes + pad->tail; 1461 pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align; 1462 pad->buf = qemu_blockalign(bs, pad->buf_len); 1463 pad->merge_reads = sum == pad->buf_len; 1464 if (pad->tail) { 1465 pad->tail_buf = pad->buf + pad->buf_len - align; 1466 } 1467 1468 return true; 1469 } 1470 1471 static coroutine_fn int bdrv_padding_rmw_read(BdrvChild *child, 1472 BdrvTrackedRequest *req, 1473 BdrvRequestPadding *pad, 1474 bool zero_middle) 1475 { 1476 QEMUIOVector local_qiov; 1477 BlockDriverState *bs = child->bs; 1478 uint64_t align = bs->bl.request_alignment; 1479 int ret; 1480 1481 assert(req->serialising && pad->buf); 1482 1483 if (pad->head || pad->merge_reads) { 1484 int64_t bytes = pad->merge_reads ? pad->buf_len : align; 1485 1486 qemu_iovec_init_buf(&local_qiov, pad->buf, bytes); 1487 1488 if (pad->head) { 1489 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1490 } 1491 if (pad->merge_reads && pad->tail) { 1492 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1493 } 1494 ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes, 1495 align, &local_qiov, 0, 0); 1496 if (ret < 0) { 1497 return ret; 1498 } 1499 if (pad->head) { 1500 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1501 } 1502 if (pad->merge_reads && pad->tail) { 1503 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1504 } 1505 1506 if (pad->merge_reads) { 1507 goto zero_mem; 1508 } 1509 } 1510 1511 if (pad->tail) { 1512 qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align); 1513 1514 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1515 ret = bdrv_aligned_preadv( 1516 child, req, 1517 req->overlap_offset + req->overlap_bytes - align, 1518 align, align, &local_qiov, 0, 0); 1519 if (ret < 0) { 1520 return ret; 1521 } 1522 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1523 } 1524 1525 zero_mem: 1526 if (zero_middle) { 1527 memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail); 1528 } 1529 1530 return 0; 1531 } 1532 1533 static void bdrv_padding_destroy(BdrvRequestPadding *pad) 1534 { 1535 if (pad->buf) { 1536 qemu_vfree(pad->buf); 1537 qemu_iovec_destroy(&pad->local_qiov); 1538 } 1539 memset(pad, 0, sizeof(*pad)); 1540 } 1541 1542 /* 1543 * bdrv_pad_request 1544 * 1545 * Exchange request parameters with padded request if needed. Don't include RMW 1546 * read of padding, bdrv_padding_rmw_read() should be called separately if 1547 * needed. 
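* On success the padding state lives in @pad and must be released with bdrv_padding_destroy() once the padded request has completed.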
1548 * 1549 * Request parameters (@qiov, @qiov_offset, @offset, @bytes) are in-out: 1550 * - on function start they represent original request 1551 * - on failure or when padding is not needed they are unchanged 1552 * - on success when padding is needed they represent padded request 1553 */ 1554 static int bdrv_pad_request(BlockDriverState *bs, 1555 QEMUIOVector **qiov, size_t *qiov_offset, 1556 int64_t *offset, int64_t *bytes, 1557 BdrvRequestPadding *pad, bool *padded, 1558 BdrvRequestFlags *flags) 1559 { 1560 int ret; 1561 1562 bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort); 1563 1564 if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { 1565 if (padded) { 1566 *padded = false; 1567 } 1568 return 0; 1569 } 1570 1571 ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, 1572 *qiov, *qiov_offset, *bytes, 1573 pad->buf + pad->buf_len - pad->tail, 1574 pad->tail); 1575 if (ret < 0) { 1576 bdrv_padding_destroy(pad); 1577 return ret; 1578 } 1579 *bytes += pad->head + pad->tail; 1580 *offset -= pad->head; 1581 *qiov = &pad->local_qiov; 1582 *qiov_offset = 0; 1583 if (padded) { 1584 *padded = true; 1585 } 1586 if (flags) { 1587 /* Can't use optimization hint with bounce buffer */ 1588 *flags &= ~BDRV_REQ_REGISTERED_BUF; 1589 } 1590 1591 return 0; 1592 } 1593 1594 int coroutine_fn bdrv_co_preadv(BdrvChild *child, 1595 int64_t offset, int64_t bytes, QEMUIOVector *qiov, 1596 BdrvRequestFlags flags) 1597 { 1598 IO_CODE(); 1599 return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags); 1600 } 1601 1602 int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, 1603 int64_t offset, int64_t bytes, 1604 QEMUIOVector *qiov, size_t qiov_offset, 1605 BdrvRequestFlags flags) 1606 { 1607 BlockDriverState *bs = child->bs; 1608 BdrvTrackedRequest req; 1609 BdrvRequestPadding pad; 1610 int ret; 1611 IO_CODE(); 1612 1613 trace_bdrv_co_preadv_part(bs, offset, bytes, flags); 1614 1615 if (!bdrv_is_inserted(bs)) { 1616 return -ENOMEDIUM; 1617 } 1618 1619 ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); 1620 if (ret < 0) { 1621 return ret; 1622 } 1623 1624 if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 1625 /* 1626 * Aligning a zero-length request is nonsense. Even if the driver has a special meaning 1627 * for zero-length requests (like qcow2_co_pwritev_compressed_part), we can't pass 1628 * them to the driver due to request_alignment. 1629 * 1630 * Still, there is no reason to return an error if someone does an unaligned 1631 * zero-length read occasionally.
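* (bdrv_co_pwritev_part() makes the same exception for unaligned zero-length writes.)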
1632 */ 1633 return 0; 1634 } 1635 1636 bdrv_inc_in_flight(bs); 1637 1638 /* Don't do copy-on-read if we read data before write operation */ 1639 if (qatomic_read(&bs->copy_on_read)) { 1640 flags |= BDRV_REQ_COPY_ON_READ; 1641 } 1642 1643 ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, 1644 NULL, &flags); 1645 if (ret < 0) { 1646 goto fail; 1647 } 1648 1649 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 1650 ret = bdrv_aligned_preadv(child, &req, offset, bytes, 1651 bs->bl.request_alignment, 1652 qiov, qiov_offset, flags); 1653 tracked_request_end(&req); 1654 bdrv_padding_destroy(&pad); 1655 1656 fail: 1657 bdrv_dec_in_flight(bs); 1658 1659 return ret; 1660 } 1661 1662 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1663 int64_t offset, int64_t bytes, BdrvRequestFlags flags) 1664 { 1665 BlockDriver *drv = bs->drv; 1666 QEMUIOVector qiov; 1667 void *buf = NULL; 1668 int ret = 0; 1669 bool need_flush = false; 1670 int head = 0; 1671 int tail = 0; 1672 1673 int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, 1674 INT64_MAX); 1675 int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1676 bs->bl.request_alignment); 1677 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1678 1679 bdrv_check_request(offset, bytes, &error_abort); 1680 1681 if (!drv) { 1682 return -ENOMEDIUM; 1683 } 1684 1685 if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) { 1686 return -ENOTSUP; 1687 } 1688 1689 /* By definition there is no user buffer so this flag doesn't make sense */ 1690 if (flags & BDRV_REQ_REGISTERED_BUF) { 1691 return -EINVAL; 1692 } 1693 1694 /* Invalidate the cached block-status data range if this write overlaps */ 1695 bdrv_bsc_invalidate_range(bs, offset, bytes); 1696 1697 assert(alignment % bs->bl.request_alignment == 0); 1698 head = offset % alignment; 1699 tail = (offset + bytes) % alignment; 1700 max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1701 assert(max_write_zeroes >= bs->bl.request_alignment); 1702 1703 while (bytes > 0 && !ret) { 1704 int64_t num = bytes; 1705 1706 /* Align request. Block drivers can expect the "bulk" of the request 1707 * to be aligned, and that unaligned requests do not cross cluster 1708 * boundaries. 1709 */ 1710 if (head) { 1711 /* Make a small request up to the first aligned sector. For 1712 * convenience, limit this request to max_transfer even if 1713 * we don't need to fall back to writes. */ 1714 num = MIN(MIN(bytes, max_transfer), alignment - head); 1715 head = (head + num) % alignment; 1716 assert(num < max_write_zeroes); 1717 } else if (tail && num > alignment) { 1718 /* Shorten the request to the last aligned sector. 
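* The unaligned tail is then written as a separate, shorter request by a later iteration of this loop.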
*/ 1719 num -= tail; 1720 } 1721 1722 /* limit request size */ 1723 if (num > max_write_zeroes) { 1724 num = max_write_zeroes; 1725 } 1726 1727 ret = -ENOTSUP; 1728 /* First try the efficient write zeroes operation */ 1729 if (drv->bdrv_co_pwrite_zeroes) { 1730 ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, 1731 flags & bs->supported_zero_flags); 1732 if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && 1733 !(bs->supported_zero_flags & BDRV_REQ_FUA)) { 1734 need_flush = true; 1735 } 1736 } else { 1737 assert(!bs->supported_zero_flags); 1738 } 1739 1740 if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) { 1741 /* Fall back to bounce buffer if write zeroes is unsupported */ 1742 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; 1743 1744 if ((flags & BDRV_REQ_FUA) && 1745 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1746 /* No need for bdrv_driver_pwritev() to do a fallback 1747 * flush on each chunk; use just one at the end */ 1748 write_flags &= ~BDRV_REQ_FUA; 1749 need_flush = true; 1750 } 1751 num = MIN(num, max_transfer); 1752 if (buf == NULL) { 1753 buf = qemu_try_blockalign0(bs, num); 1754 if (buf == NULL) { 1755 ret = -ENOMEM; 1756 goto fail; 1757 } 1758 } 1759 qemu_iovec_init_buf(&qiov, buf, num); 1760 1761 ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags); 1762 1763 /* Keep bounce buffer around if it is big enough for 1764 * all future requests. 1765 */ 1766 if (num < max_transfer) { 1767 qemu_vfree(buf); 1768 buf = NULL; 1769 } 1770 } 1771 1772 offset += num; 1773 bytes -= num; 1774 } 1775 1776 fail: 1777 if (ret == 0 && need_flush) { 1778 ret = bdrv_co_flush(bs); 1779 } 1780 qemu_vfree(buf); 1781 return ret; 1782 } 1783 1784 static inline int coroutine_fn 1785 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes, 1786 BdrvTrackedRequest *req, int flags) 1787 { 1788 BlockDriverState *bs = child->bs; 1789 1790 bdrv_check_request(offset, bytes, &error_abort); 1791 1792 if (bdrv_is_read_only(bs)) { 1793 return -EPERM; 1794 } 1795 1796 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1797 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1798 assert(!(flags & ~BDRV_REQ_MASK)); 1799 assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING))); 1800 1801 if (flags & BDRV_REQ_SERIALISING) { 1802 QEMU_LOCK_GUARD(&bs->reqs_lock); 1803 1804 tracked_request_set_serialising(req, bdrv_get_cluster_size(bs)); 1805 1806 if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) { 1807 return -EBUSY; 1808 } 1809 1810 bdrv_wait_serialising_requests_locked(req); 1811 } else { 1812 bdrv_wait_serialising_requests(req); 1813 } 1814 1815 assert(req->overlap_offset <= offset); 1816 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1817 assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE || 1818 child->perm & BLK_PERM_RESIZE); 1819 1820 switch (req->type) { 1821 case BDRV_TRACKED_WRITE: 1822 case BDRV_TRACKED_DISCARD: 1823 if (flags & BDRV_REQ_WRITE_UNCHANGED) { 1824 assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1825 } else { 1826 assert(child->perm & BLK_PERM_WRITE); 1827 } 1828 bdrv_write_threshold_check_write(bs, offset, bytes); 1829 return 0; 1830 case BDRV_TRACKED_TRUNCATE: 1831 assert(child->perm & BLK_PERM_RESIZE); 1832 return 0; 1833 default: 1834 abort(); 1835 } 1836 } 1837 1838 static inline void coroutine_fn 1839 bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes, 1840 BdrvTrackedRequest *req, int ret) 1841 { 1842 int64_t end_sector =
DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 1843 BlockDriverState *bs = child->bs; 1844 1845 bdrv_check_request(offset, bytes, &error_abort); 1846 1847 qatomic_inc(&bs->write_gen); 1848 1849 /* 1850 * Discard cannot extend the image, but in error handling cases, such as 1851 * when reverting a qcow2 cluster allocation, the discarded range can pass 1852 * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD 1853 * here. Instead, just skip it, since semantically a discard request 1854 * beyond EOF cannot expand the image anyway. 1855 */ 1856 if (ret == 0 && 1857 (req->type == BDRV_TRACKED_TRUNCATE || 1858 end_sector > bs->total_sectors) && 1859 req->type != BDRV_TRACKED_DISCARD) { 1860 bs->total_sectors = end_sector; 1861 bdrv_parent_cb_resize(bs); 1862 bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); 1863 } 1864 if (req->bytes) { 1865 switch (req->type) { 1866 case BDRV_TRACKED_WRITE: 1867 stat64_max(&bs->wr_highest_offset, offset + bytes); 1868 /* fall through, to set dirty bits */ 1869 case BDRV_TRACKED_DISCARD: 1870 bdrv_set_dirty(bs, offset, bytes); 1871 break; 1872 default: 1873 break; 1874 } 1875 } 1876 } 1877 1878 /* 1879 * Forwards an already correctly aligned write request to the BlockDriver, 1880 * after possibly fragmenting it. 1881 */ 1882 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1883 BdrvTrackedRequest *req, int64_t offset, int64_t bytes, 1884 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, 1885 BdrvRequestFlags flags) 1886 { 1887 BlockDriverState *bs = child->bs; 1888 BlockDriver *drv = bs->drv; 1889 int ret; 1890 1891 int64_t bytes_remaining = bytes; 1892 int max_transfer; 1893 1894 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1895 1896 if (!drv) { 1897 return -ENOMEDIUM; 1898 } 1899 1900 if (bdrv_has_readonly_bitmaps(bs)) { 1901 return -EPERM; 1902 } 1903 1904 assert(is_power_of_2(align)); 1905 assert((offset & (align - 1)) == 0); 1906 assert((bytes & (align - 1)) == 0); 1907 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1908 align); 1909 1910 ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 1911 1912 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1913 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 1914 qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 1915 flags |= BDRV_REQ_ZERO_WRITE; 1916 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1917 flags |= BDRV_REQ_MAY_UNMAP; 1918 } 1919 } 1920 1921 if (ret < 0) { 1922 /* Do nothing, write notifier decided to fail this request */ 1923 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1924 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1925 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 1926 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 1927 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 1928 qiov, qiov_offset); 1929 } else if (bytes <= max_transfer) { 1930 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1931 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 1932 } else { 1933 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1934 while (bytes_remaining) { 1935 int num = MIN(bytes_remaining, max_transfer); 1936 int local_flags = flags; 1937 1938 assert(num); 1939 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 1940 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1941 /* If FUA is going to be emulated by flush, we only 1942 * need to flush on the last iteration */ 1943 local_flags &= ~BDRV_REQ_FUA; 1944 } 1945 1946 ret = 
bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 1947 num, qiov, 1948 qiov_offset + bytes - bytes_remaining, 1949 local_flags); 1950 if (ret < 0) { 1951 break; 1952 } 1953 bytes_remaining -= num; 1954 } 1955 } 1956 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 1957 1958 if (ret >= 0) { 1959 ret = 0; 1960 } 1961 bdrv_co_write_req_finish(child, offset, bytes, req, ret); 1962 1963 return ret; 1964 } 1965 1966 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 1967 int64_t offset, 1968 int64_t bytes, 1969 BdrvRequestFlags flags, 1970 BdrvTrackedRequest *req) 1971 { 1972 BlockDriverState *bs = child->bs; 1973 QEMUIOVector local_qiov; 1974 uint64_t align = bs->bl.request_alignment; 1975 int ret = 0; 1976 bool padding; 1977 BdrvRequestPadding pad; 1978 1979 /* This flag doesn't make sense for padding or zero writes */ 1980 flags &= ~BDRV_REQ_REGISTERED_BUF; 1981 1982 padding = bdrv_init_padding(bs, offset, bytes, &pad); 1983 if (padding) { 1984 assert(!(flags & BDRV_REQ_NO_WAIT)); 1985 bdrv_make_request_serialising(req, align); 1986 1987 bdrv_padding_rmw_read(child, req, &pad, true); 1988 1989 if (pad.head || pad.merge_reads) { 1990 int64_t aligned_offset = offset & ~(align - 1); 1991 int64_t write_bytes = pad.merge_reads ? pad.buf_len : align; 1992 1993 qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 1994 ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 1995 align, &local_qiov, 0, 1996 flags & ~BDRV_REQ_ZERO_WRITE); 1997 if (ret < 0 || pad.merge_reads) { 1998 /* Error or all work is done */ 1999 goto out; 2000 } 2001 offset += write_bytes - pad.head; 2002 bytes -= write_bytes - pad.head; 2003 } 2004 } 2005 2006 assert(!bytes || (offset & (align - 1)) == 0); 2007 if (bytes >= align) { 2008 /* Write the aligned part in the middle. 
         */
        int64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, 0, flags);
        if (ret < 0) {
            goto out;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == pad.tail + bytes);

        qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, 0,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
    }

out:
    bdrv_padding_destroy(&pad);

    return ret;
}

/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    IO_CODE();
    return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
}

int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
    int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    BdrvRequestPadding pad;
    int ret;
    bool padded = false;
    IO_CODE();

    trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags);

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
    } else {
        ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
    }
    if (ret < 0) {
        return ret;
    }

    /* If the request is misaligned then we can't make it efficient */
    if ((flags & BDRV_REQ_NO_FALLBACK) &&
        !QEMU_IS_ALIGNED(offset | bytes, align))
    {
        return -ENOTSUP;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning a zero-length request is pointless. Even if the driver
         * gives zero-length requests a special meaning (as
         * qcow2_co_pwritev_compressed_part does), we cannot pass such a
         * request to the driver because of request_alignment.
         *
         * Still, there is no reason to return an error if someone
         * occasionally issues an unaligned zero-length write.
         */
        return 0;
    }

    if (!(flags & BDRV_REQ_ZERO_WRITE)) {
        /*
         * Pad the request for the following read-modify-write cycle.
         * bdrv_co_do_zero_pwritev() handles alignment by itself, so we only
         * pad here when the ZERO flag is absent.
         */
        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
                               &padded, &flags);
        if (ret < 0) {
            return ret;
        }
    }

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        assert(!padded);
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (padded) {
        /*
         * The request was unaligned to request_alignment and was therefore
         * padded. We are going to do read-modify-write, and must
         * serialize the request to prevent interactions of the
         * widened region with other transactions.
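         * For example, with request_alignment = 4096, a 512-byte write at
         * offset 4608 is widened to the whole block [4096, 8192); the
         * read-modify-write of that block must not race with other writes
         * touching the same block.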
2120 */ 2121 assert(!(flags & BDRV_REQ_NO_WAIT)); 2122 bdrv_make_request_serialising(&req, align); 2123 bdrv_padding_rmw_read(child, &req, &pad, false); 2124 } 2125 2126 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 2127 qiov, qiov_offset, flags); 2128 2129 bdrv_padding_destroy(&pad); 2130 2131 out: 2132 tracked_request_end(&req); 2133 bdrv_dec_in_flight(bs); 2134 2135 return ret; 2136 } 2137 2138 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2139 int64_t bytes, BdrvRequestFlags flags) 2140 { 2141 IO_CODE(); 2142 trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 2143 2144 if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 2145 flags &= ~BDRV_REQ_MAY_UNMAP; 2146 } 2147 2148 return bdrv_co_pwritev(child, offset, bytes, NULL, 2149 BDRV_REQ_ZERO_WRITE | flags); 2150 } 2151 2152 /* 2153 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not. 2154 */ 2155 int bdrv_flush_all(void) 2156 { 2157 BdrvNextIterator it; 2158 BlockDriverState *bs = NULL; 2159 int result = 0; 2160 2161 GLOBAL_STATE_CODE(); 2162 2163 /* 2164 * bdrv queue is managed by record/replay, 2165 * creating new flush request for stopping 2166 * the VM may break the determinism 2167 */ 2168 if (replay_events_enabled()) { 2169 return result; 2170 } 2171 2172 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 2173 AioContext *aio_context = bdrv_get_aio_context(bs); 2174 int ret; 2175 2176 aio_context_acquire(aio_context); 2177 ret = bdrv_flush(bs); 2178 if (ret < 0 && !result) { 2179 result = ret; 2180 } 2181 aio_context_release(aio_context); 2182 } 2183 2184 return result; 2185 } 2186 2187 /* 2188 * Returns the allocation status of the specified sectors. 2189 * Drivers not implementing the functionality are assumed to not support 2190 * backing files, hence all their sectors are reported as allocated. 2191 * 2192 * If 'want_zero' is true, the caller is querying for mapping 2193 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 2194 * _ZERO where possible; otherwise, the result favors larger 'pnum', 2195 * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2196 * 2197 * If 'offset' is beyond the end of the disk image the return value is 2198 * BDRV_BLOCK_EOF and 'pnum' is set to 0. 2199 * 2200 * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2201 * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2202 * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 2203 * 2204 * 'pnum' is set to the number of bytes (including and immediately 2205 * following the specified offset) that are easily known to be in the 2206 * same allocated/unallocated state. Note that a second call starting 2207 * at the original offset plus returned pnum may have the same status. 2208 * The returned value is non-zero on success except at end-of-file. 2209 * 2210 * Returns negative errno on failure. Otherwise, if the 2211 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 2212 * set to the host mapping and BDS corresponding to the guest offset. 
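 * For a protocol node (e.g. a raw file), 'map' typically equals the guest
 * offset and 'file' is the node itself; a format driver instead points
 * 'file' at the protocol node that actually stores the data.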
2213 */ 2214 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2215 bool want_zero, 2216 int64_t offset, int64_t bytes, 2217 int64_t *pnum, int64_t *map, 2218 BlockDriverState **file) 2219 { 2220 int64_t total_size; 2221 int64_t n; /* bytes */ 2222 int ret; 2223 int64_t local_map = 0; 2224 BlockDriverState *local_file = NULL; 2225 int64_t aligned_offset, aligned_bytes; 2226 uint32_t align; 2227 bool has_filtered_child; 2228 2229 assert(pnum); 2230 *pnum = 0; 2231 total_size = bdrv_getlength(bs); 2232 if (total_size < 0) { 2233 ret = total_size; 2234 goto early_out; 2235 } 2236 2237 if (offset >= total_size) { 2238 ret = BDRV_BLOCK_EOF; 2239 goto early_out; 2240 } 2241 if (!bytes) { 2242 ret = 0; 2243 goto early_out; 2244 } 2245 2246 n = total_size - offset; 2247 if (n < bytes) { 2248 bytes = n; 2249 } 2250 2251 /* Must be non-NULL or bdrv_getlength() would have failed */ 2252 assert(bs->drv); 2253 has_filtered_child = bdrv_filter_child(bs); 2254 if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { 2255 *pnum = bytes; 2256 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 2257 if (offset + bytes == total_size) { 2258 ret |= BDRV_BLOCK_EOF; 2259 } 2260 if (bs->drv->protocol_name) { 2261 ret |= BDRV_BLOCK_OFFSET_VALID; 2262 local_map = offset; 2263 local_file = bs; 2264 } 2265 goto early_out; 2266 } 2267 2268 bdrv_inc_in_flight(bs); 2269 2270 /* Round out to request_alignment boundaries */ 2271 align = bs->bl.request_alignment; 2272 aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2273 aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2274 2275 if (bs->drv->bdrv_co_block_status) { 2276 /* 2277 * Use the block-status cache only for protocol nodes: Format 2278 * drivers are generally quick to inquire the status, but protocol 2279 * drivers often need to get information from outside of qemu, so 2280 * we do not have control over the actual implementation. There 2281 * have been cases where inquiring the status took an unreasonably 2282 * long time, and we can do nothing in qemu to fix it. 2283 * This is especially problematic for images with large data areas, 2284 * because finding the few holes in them and giving them special 2285 * treatment does not gain much performance. Therefore, we try to 2286 * cache the last-identified data region. 2287 * 2288 * Second, limiting ourselves to protocol nodes allows us to assume 2289 * the block status for data regions to be DATA | OFFSET_VALID, and 2290 * that the host offset is the same as the guest offset. 2291 * 2292 * Note that it is possible that external writers zero parts of 2293 * the cached regions without the cache being invalidated, and so 2294 * we may report zeroes as data. This is not catastrophic, 2295 * however, because reporting zeroes as data is fine. 2296 */ 2297 if (QLIST_EMPTY(&bs->children) && 2298 bdrv_bsc_is_data(bs, aligned_offset, pnum)) 2299 { 2300 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; 2301 local_file = bs; 2302 local_map = aligned_offset; 2303 } else { 2304 ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 2305 aligned_bytes, pnum, &local_map, 2306 &local_file); 2307 2308 /* 2309 * Note that checking QLIST_EMPTY(&bs->children) is also done when 2310 * the cache is queried above. Technically, we do not need to check 2311 * it here; the worst that can happen is that we fill the cache for 2312 * non-protocol nodes, and then it is never used. However, filling 2313 * the cache requires an RCU update, so double check here to avoid 2314 * such an update if possible. 
2315 * 2316 * Check want_zero, because we only want to update the cache when we 2317 * have accurate information about what is zero and what is data. 2318 */ 2319 if (want_zero && 2320 ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && 2321 QLIST_EMPTY(&bs->children)) 2322 { 2323 /* 2324 * When a protocol driver reports BLOCK_OFFSET_VALID, the 2325 * returned local_map value must be the same as the offset we 2326 * have passed (aligned_offset), and local_bs must be the node 2327 * itself. 2328 * Assert this, because we follow this rule when reading from 2329 * the cache (see the `local_file = bs` and 2330 * `local_map = aligned_offset` assignments above), and the 2331 * result the cache delivers must be the same as the driver 2332 * would deliver. 2333 */ 2334 assert(local_file == bs); 2335 assert(local_map == aligned_offset); 2336 bdrv_bsc_fill(bs, aligned_offset, *pnum); 2337 } 2338 } 2339 } else { 2340 /* Default code for filters */ 2341 2342 local_file = bdrv_filter_bs(bs); 2343 assert(local_file); 2344 2345 *pnum = aligned_bytes; 2346 local_map = aligned_offset; 2347 ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2348 } 2349 if (ret < 0) { 2350 *pnum = 0; 2351 goto out; 2352 } 2353 2354 /* 2355 * The driver's result must be a non-zero multiple of request_alignment. 2356 * Clamp pnum and adjust map to original request. 2357 */ 2358 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2359 align > offset - aligned_offset); 2360 if (ret & BDRV_BLOCK_RECURSE) { 2361 assert(ret & BDRV_BLOCK_DATA); 2362 assert(ret & BDRV_BLOCK_OFFSET_VALID); 2363 assert(!(ret & BDRV_BLOCK_ZERO)); 2364 } 2365 2366 *pnum -= offset - aligned_offset; 2367 if (*pnum > bytes) { 2368 *pnum = bytes; 2369 } 2370 if (ret & BDRV_BLOCK_OFFSET_VALID) { 2371 local_map += offset - aligned_offset; 2372 } 2373 2374 if (ret & BDRV_BLOCK_RAW) { 2375 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2376 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2377 *pnum, pnum, &local_map, &local_file); 2378 goto out; 2379 } 2380 2381 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2382 ret |= BDRV_BLOCK_ALLOCATED; 2383 } else if (bs->drv->supports_backing) { 2384 BlockDriverState *cow_bs = bdrv_cow_bs(bs); 2385 2386 if (!cow_bs) { 2387 ret |= BDRV_BLOCK_ZERO; 2388 } else if (want_zero) { 2389 int64_t size2 = bdrv_getlength(cow_bs); 2390 2391 if (size2 >= 0 && offset >= size2) { 2392 ret |= BDRV_BLOCK_ZERO; 2393 } 2394 } 2395 } 2396 2397 if (want_zero && ret & BDRV_BLOCK_RECURSE && 2398 local_file && local_file != bs && 2399 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2400 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2401 int64_t file_pnum; 2402 int ret2; 2403 2404 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2405 *pnum, &file_pnum, NULL, NULL); 2406 if (ret2 >= 0) { 2407 /* Ignore errors. This is just providing extra information, it 2408 * is useful but not necessary. 2409 */ 2410 if (ret2 & BDRV_BLOCK_EOF && 2411 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2412 /* 2413 * It is valid for the format block driver to read 2414 * beyond the end of the underlying file's current 2415 * size; such areas read as zero. 
2416 */ 2417 ret |= BDRV_BLOCK_ZERO; 2418 } else { 2419 /* Limit request to the range reported by the protocol driver */ 2420 *pnum = file_pnum; 2421 ret |= (ret2 & BDRV_BLOCK_ZERO); 2422 } 2423 } 2424 } 2425 2426 out: 2427 bdrv_dec_in_flight(bs); 2428 if (ret >= 0 && offset + *pnum == total_size) { 2429 ret |= BDRV_BLOCK_EOF; 2430 } 2431 early_out: 2432 if (file) { 2433 *file = local_file; 2434 } 2435 if (map) { 2436 *map = local_map; 2437 } 2438 return ret; 2439 } 2440 2441 int coroutine_fn 2442 bdrv_co_common_block_status_above(BlockDriverState *bs, 2443 BlockDriverState *base, 2444 bool include_base, 2445 bool want_zero, 2446 int64_t offset, 2447 int64_t bytes, 2448 int64_t *pnum, 2449 int64_t *map, 2450 BlockDriverState **file, 2451 int *depth) 2452 { 2453 int ret; 2454 BlockDriverState *p; 2455 int64_t eof = 0; 2456 int dummy; 2457 IO_CODE(); 2458 2459 assert(!include_base || base); /* Can't include NULL base */ 2460 2461 if (!depth) { 2462 depth = &dummy; 2463 } 2464 *depth = 0; 2465 2466 if (!include_base && bs == base) { 2467 *pnum = bytes; 2468 return 0; 2469 } 2470 2471 ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file); 2472 ++*depth; 2473 if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) { 2474 return ret; 2475 } 2476 2477 if (ret & BDRV_BLOCK_EOF) { 2478 eof = offset + *pnum; 2479 } 2480 2481 assert(*pnum <= bytes); 2482 bytes = *pnum; 2483 2484 for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base; 2485 p = bdrv_filter_or_cow_bs(p)) 2486 { 2487 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2488 file); 2489 ++*depth; 2490 if (ret < 0) { 2491 return ret; 2492 } 2493 if (*pnum == 0) { 2494 /* 2495 * The top layer deferred to this layer, and because this layer is 2496 * short, any zeroes that we synthesize beyond EOF behave as if they 2497 * were allocated at this layer. 2498 * 2499 * We don't include BDRV_BLOCK_EOF into ret, as upper layer may be 2500 * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 2501 * below. 2502 */ 2503 assert(ret & BDRV_BLOCK_EOF); 2504 *pnum = bytes; 2505 if (file) { 2506 *file = p; 2507 } 2508 ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED; 2509 break; 2510 } 2511 if (ret & BDRV_BLOCK_ALLOCATED) { 2512 /* 2513 * We've found the node and the status, we must break. 2514 * 2515 * Drop BDRV_BLOCK_EOF, as it's not for upper layer, which may be 2516 * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 2517 * below. 2518 */ 2519 ret &= ~BDRV_BLOCK_EOF; 2520 break; 2521 } 2522 2523 if (p == base) { 2524 assert(include_base); 2525 break; 2526 } 2527 2528 /* 2529 * OK, [offset, offset + *pnum) region is unallocated on this layer, 2530 * let's continue the diving. 
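         * (i.e. query the next filter or backing node in the chain for the
         * same range)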
2531 */ 2532 assert(*pnum <= bytes); 2533 bytes = *pnum; 2534 } 2535 2536 if (offset + *pnum == eof) { 2537 ret |= BDRV_BLOCK_EOF; 2538 } 2539 2540 return ret; 2541 } 2542 2543 int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2544 BlockDriverState *base, 2545 int64_t offset, int64_t bytes, 2546 int64_t *pnum, int64_t *map, 2547 BlockDriverState **file) 2548 { 2549 IO_CODE(); 2550 return bdrv_co_common_block_status_above(bs, base, false, true, offset, 2551 bytes, pnum, map, file, NULL); 2552 } 2553 2554 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 2555 int64_t offset, int64_t bytes, int64_t *pnum, 2556 int64_t *map, BlockDriverState **file) 2557 { 2558 IO_CODE(); 2559 return bdrv_common_block_status_above(bs, base, false, true, offset, bytes, 2560 pnum, map, file, NULL); 2561 } 2562 2563 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2564 int64_t *pnum, int64_t *map, BlockDriverState **file) 2565 { 2566 IO_CODE(); 2567 return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), 2568 offset, bytes, pnum, map, file); 2569 } 2570 2571 /* 2572 * Check @bs (and its backing chain) to see if the range defined 2573 * by @offset and @bytes is known to read as zeroes. 2574 * Return 1 if that is the case, 0 otherwise and -errno on error. 2575 * This test is meant to be fast rather than accurate so returning 0 2576 * does not guarantee non-zero data. 2577 */ 2578 int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, 2579 int64_t bytes) 2580 { 2581 int ret; 2582 int64_t pnum = bytes; 2583 IO_CODE(); 2584 2585 if (!bytes) { 2586 return 1; 2587 } 2588 2589 ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset, 2590 bytes, &pnum, NULL, NULL, NULL); 2591 2592 if (ret < 0) { 2593 return ret; 2594 } 2595 2596 return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); 2597 } 2598 2599 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, 2600 int64_t bytes, int64_t *pnum) 2601 { 2602 int ret; 2603 int64_t dummy; 2604 IO_CODE(); 2605 2606 ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset, 2607 bytes, pnum ? pnum : &dummy, NULL, 2608 NULL, NULL); 2609 if (ret < 0) { 2610 return ret; 2611 } 2612 return !!(ret & BDRV_BLOCK_ALLOCATED); 2613 } 2614 2615 int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, 2616 int64_t *pnum) 2617 { 2618 int ret; 2619 int64_t dummy; 2620 IO_CODE(); 2621 2622 ret = bdrv_common_block_status_above(bs, bs, true, false, offset, 2623 bytes, pnum ? pnum : &dummy, NULL, 2624 NULL, NULL); 2625 if (ret < 0) { 2626 return ret; 2627 } 2628 return !!(ret & BDRV_BLOCK_ALLOCATED); 2629 } 2630 2631 /* See bdrv_is_allocated_above for documentation */ 2632 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, 2633 BlockDriverState *base, 2634 bool include_base, int64_t offset, 2635 int64_t bytes, int64_t *pnum) 2636 { 2637 int depth; 2638 int ret; 2639 IO_CODE(); 2640 2641 ret = bdrv_co_common_block_status_above(top, base, include_base, false, 2642 offset, bytes, pnum, NULL, NULL, 2643 &depth); 2644 if (ret < 0) { 2645 return ret; 2646 } 2647 2648 if (ret & BDRV_BLOCK_ALLOCATED) { 2649 return depth; 2650 } 2651 return 0; 2652 } 2653 2654 /* 2655 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 2656 * 2657 * Return a positive depth if (a prefix of) the given range is allocated 2658 * in any image between BASE and TOP (BASE is only included if include_base 2659 * is set). 
Depth 1 is TOP, 2 is the first backing layer, and so forth. 2660 * BASE can be NULL to check if the given offset is allocated in any 2661 * image of the chain. Return 0 otherwise, or negative errno on 2662 * failure. 2663 * 2664 * 'pnum' is set to the number of bytes (including and immediately 2665 * following the specified offset) that are known to be in the same 2666 * allocated/unallocated state. Note that a subsequent call starting 2667 * at 'offset + *pnum' may return the same allocation status (in other 2668 * words, the result is not necessarily the maximum possible range); 2669 * but 'pnum' will only be 0 when end of file is reached. 2670 */ 2671 int bdrv_is_allocated_above(BlockDriverState *top, 2672 BlockDriverState *base, 2673 bool include_base, int64_t offset, 2674 int64_t bytes, int64_t *pnum) 2675 { 2676 int depth; 2677 int ret; 2678 IO_CODE(); 2679 2680 ret = bdrv_common_block_status_above(top, base, include_base, false, 2681 offset, bytes, pnum, NULL, NULL, 2682 &depth); 2683 if (ret < 0) { 2684 return ret; 2685 } 2686 2687 if (ret & BDRV_BLOCK_ALLOCATED) { 2688 return depth; 2689 } 2690 return 0; 2691 } 2692 2693 int coroutine_fn 2694 bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2695 { 2696 BlockDriver *drv = bs->drv; 2697 BlockDriverState *child_bs = bdrv_primary_bs(bs); 2698 int ret; 2699 IO_CODE(); 2700 assert_bdrv_graph_readable(); 2701 2702 ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); 2703 if (ret < 0) { 2704 return ret; 2705 } 2706 2707 if (!drv) { 2708 return -ENOMEDIUM; 2709 } 2710 2711 bdrv_inc_in_flight(bs); 2712 2713 if (drv->bdrv_load_vmstate) { 2714 ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2715 } else if (child_bs) { 2716 ret = bdrv_co_readv_vmstate(child_bs, qiov, pos); 2717 } else { 2718 ret = -ENOTSUP; 2719 } 2720 2721 bdrv_dec_in_flight(bs); 2722 2723 return ret; 2724 } 2725 2726 int coroutine_fn 2727 bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2728 { 2729 BlockDriver *drv = bs->drv; 2730 BlockDriverState *child_bs = bdrv_primary_bs(bs); 2731 int ret; 2732 IO_CODE(); 2733 assert_bdrv_graph_readable(); 2734 2735 ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); 2736 if (ret < 0) { 2737 return ret; 2738 } 2739 2740 if (!drv) { 2741 return -ENOMEDIUM; 2742 } 2743 2744 bdrv_inc_in_flight(bs); 2745 2746 if (drv->bdrv_save_vmstate) { 2747 ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2748 } else if (child_bs) { 2749 ret = bdrv_co_writev_vmstate(child_bs, qiov, pos); 2750 } else { 2751 ret = -ENOTSUP; 2752 } 2753 2754 bdrv_dec_in_flight(bs); 2755 2756 return ret; 2757 } 2758 2759 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 2760 int64_t pos, int size) 2761 { 2762 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2763 int ret = bdrv_writev_vmstate(bs, &qiov, pos); 2764 IO_CODE(); 2765 2766 return ret < 0 ? ret : size; 2767 } 2768 2769 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 2770 int64_t pos, int size) 2771 { 2772 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2773 int ret = bdrv_readv_vmstate(bs, &qiov, pos); 2774 IO_CODE(); 2775 2776 return ret < 0 ? 
                       ret : size;
}

/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    IO_CODE();
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread. Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async; otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    IO_CODE();
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* Coroutine block device emulation */

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    BdrvChild *primary_child = bdrv_primary_child(bs);
    BdrvChild *child;
    int current_gen;
    int ret = 0;
    IO_CODE();

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    current_gen = qatomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.
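     * current_gen is sampled under reqs_lock and waiters are woken in FIFO
     * order, so a later flush never observes an older write generation.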
*/ 2842 bs->active_flush_req = true; 2843 qemu_co_mutex_unlock(&bs->reqs_lock); 2844 2845 /* Write back all layers by calling one driver function */ 2846 if (bs->drv->bdrv_co_flush) { 2847 ret = bs->drv->bdrv_co_flush(bs); 2848 goto out; 2849 } 2850 2851 /* Write back cached data to the OS even with cache=unsafe */ 2852 BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); 2853 if (bs->drv->bdrv_co_flush_to_os) { 2854 ret = bs->drv->bdrv_co_flush_to_os(bs); 2855 if (ret < 0) { 2856 goto out; 2857 } 2858 } 2859 2860 /* But don't actually force it to the disk with cache=unsafe */ 2861 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2862 goto flush_children; 2863 } 2864 2865 /* Check if we really need to flush anything */ 2866 if (bs->flushed_gen == current_gen) { 2867 goto flush_children; 2868 } 2869 2870 BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); 2871 if (!bs->drv) { 2872 /* bs->drv->bdrv_co_flush() might have ejected the BDS 2873 * (even in case of apparent success) */ 2874 ret = -ENOMEDIUM; 2875 goto out; 2876 } 2877 if (bs->drv->bdrv_co_flush_to_disk) { 2878 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2879 } else if (bs->drv->bdrv_aio_flush) { 2880 BlockAIOCB *acb; 2881 CoroutineIOCompletion co = { 2882 .coroutine = qemu_coroutine_self(), 2883 }; 2884 2885 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2886 if (acb == NULL) { 2887 ret = -EIO; 2888 } else { 2889 qemu_coroutine_yield(); 2890 ret = co.ret; 2891 } 2892 } else { 2893 /* 2894 * Some block drivers always operate in either writethrough or unsafe 2895 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2896 * know how the server works (because the behaviour is hardcoded or 2897 * depends on server-side configuration), so we can't ensure that 2898 * everything is safe on disk. Returning an error doesn't work because 2899 * that would break guests even if the server operates in writethrough 2900 * mode. 2901 * 2902 * Let's hope the user knows what he's doing. 2903 */ 2904 ret = 0; 2905 } 2906 2907 if (ret < 0) { 2908 goto out; 2909 } 2910 2911 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2912 * in the case of cache=unsafe, so there are no useless flushes. 2913 */ 2914 flush_children: 2915 ret = 0; 2916 QLIST_FOREACH(child, &bs->children, next) { 2917 if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { 2918 int this_child_ret = bdrv_co_flush(child->bs); 2919 if (!ret) { 2920 ret = this_child_ret; 2921 } 2922 } 2923 } 2924 2925 out: 2926 /* Notify any pending flushes that we have completed */ 2927 if (ret == 0) { 2928 bs->flushed_gen = current_gen; 2929 } 2930 2931 qemu_co_mutex_lock(&bs->reqs_lock); 2932 bs->active_flush_req = false; 2933 /* Return value is ignored - it's ok if wait queue is empty */ 2934 qemu_co_queue_next(&bs->flush_queue); 2935 qemu_co_mutex_unlock(&bs->reqs_lock); 2936 2937 early_exit: 2938 bdrv_dec_in_flight(bs); 2939 return ret; 2940 } 2941 2942 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2943 int64_t bytes) 2944 { 2945 BdrvTrackedRequest req; 2946 int ret; 2947 int64_t max_pdiscard; 2948 int head, tail, align; 2949 BlockDriverState *bs = child->bs; 2950 IO_CODE(); 2951 2952 if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 2953 return -ENOMEDIUM; 2954 } 2955 2956 if (bdrv_has_readonly_bitmaps(bs)) { 2957 return -EPERM; 2958 } 2959 2960 ret = bdrv_check_request(offset, bytes, NULL); 2961 if (ret < 0) { 2962 return ret; 2963 } 2964 2965 /* Do nothing if disabled. 
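     * Discard is purely advisory, so when the image was opened without
     * BDRV_O_UNMAP we can simply report success without calling the driver.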
*/ 2966 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2967 return 0; 2968 } 2969 2970 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 2971 return 0; 2972 } 2973 2974 /* Invalidate the cached block-status data range if this discard overlaps */ 2975 bdrv_bsc_invalidate_range(bs, offset, bytes); 2976 2977 /* Discard is advisory, but some devices track and coalesce 2978 * unaligned requests, so we must pass everything down rather than 2979 * round here. Still, most devices will just silently ignore 2980 * unaligned requests (by returning -ENOTSUP), so we must fragment 2981 * the request accordingly. */ 2982 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2983 assert(align % bs->bl.request_alignment == 0); 2984 head = offset % align; 2985 tail = (offset + bytes) % align; 2986 2987 bdrv_inc_in_flight(bs); 2988 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 2989 2990 ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2991 if (ret < 0) { 2992 goto out; 2993 } 2994 2995 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX), 2996 align); 2997 assert(max_pdiscard >= bs->bl.request_alignment); 2998 2999 while (bytes > 0) { 3000 int64_t num = bytes; 3001 3002 if (head) { 3003 /* Make small requests to get to alignment boundaries. */ 3004 num = MIN(bytes, align - head); 3005 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 3006 num %= bs->bl.request_alignment; 3007 } 3008 head = (head + num) % align; 3009 assert(num < max_pdiscard); 3010 } else if (tail) { 3011 if (num > align) { 3012 /* Shorten the request to the last aligned cluster. */ 3013 num -= tail; 3014 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 3015 tail > bs->bl.request_alignment) { 3016 tail %= bs->bl.request_alignment; 3017 num -= tail; 3018 } 3019 } 3020 /* limit request size */ 3021 if (num > max_pdiscard) { 3022 num = max_pdiscard; 3023 } 3024 3025 if (!bs->drv) { 3026 ret = -ENOMEDIUM; 3027 goto out; 3028 } 3029 if (bs->drv->bdrv_co_pdiscard) { 3030 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 3031 } else { 3032 BlockAIOCB *acb; 3033 CoroutineIOCompletion co = { 3034 .coroutine = qemu_coroutine_self(), 3035 }; 3036 3037 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 3038 bdrv_co_io_em_complete, &co); 3039 if (acb == NULL) { 3040 ret = -EIO; 3041 goto out; 3042 } else { 3043 qemu_coroutine_yield(); 3044 ret = co.ret; 3045 } 3046 } 3047 if (ret && ret != -ENOTSUP) { 3048 goto out; 3049 } 3050 3051 offset += num; 3052 bytes -= num; 3053 } 3054 ret = 0; 3055 out: 3056 bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3057 tracked_request_end(&req); 3058 bdrv_dec_in_flight(bs); 3059 return ret; 3060 } 3061 3062 int coroutine_fn bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 3063 { 3064 BlockDriver *drv = bs->drv; 3065 CoroutineIOCompletion co = { 3066 .coroutine = qemu_coroutine_self(), 3067 }; 3068 BlockAIOCB *acb; 3069 IO_CODE(); 3070 3071 bdrv_inc_in_flight(bs); 3072 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 3073 co.ret = -ENOTSUP; 3074 goto out; 3075 } 3076 3077 if (drv->bdrv_co_ioctl) { 3078 co.ret = drv->bdrv_co_ioctl(bs, req, buf); 3079 } else { 3080 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 3081 if (!acb) { 3082 co.ret = -ENOTSUP; 3083 goto out; 3084 } 3085 qemu_coroutine_yield(); 3086 } 3087 out: 3088 bdrv_dec_in_flight(bs); 3089 return co.ret; 3090 } 3091 3092 void *qemu_blockalign(BlockDriverState *bs, size_t size) 3093 { 3094 IO_CODE(); 
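    /*
     * Allocate a buffer aligned to the node's optimal memory alignment.
     * qemu_memalign() aborts on allocation failure; callers that prefer a
     * NULL return on failure should use qemu_try_blockalign() instead.
     */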
3095 return qemu_memalign(bdrv_opt_mem_align(bs), size); 3096 } 3097 3098 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 3099 { 3100 IO_CODE(); 3101 return memset(qemu_blockalign(bs, size), 0, size); 3102 } 3103 3104 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 3105 { 3106 size_t align = bdrv_opt_mem_align(bs); 3107 IO_CODE(); 3108 3109 /* Ensure that NULL is never returned on success */ 3110 assert(align > 0); 3111 if (size == 0) { 3112 size = align; 3113 } 3114 3115 return qemu_try_memalign(align, size); 3116 } 3117 3118 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 3119 { 3120 void *mem = qemu_try_blockalign(bs, size); 3121 IO_CODE(); 3122 3123 if (mem) { 3124 memset(mem, 0, size); 3125 } 3126 3127 return mem; 3128 } 3129 3130 void bdrv_io_plug(BlockDriverState *bs) 3131 { 3132 BdrvChild *child; 3133 IO_CODE(); 3134 3135 QLIST_FOREACH(child, &bs->children, next) { 3136 bdrv_io_plug(child->bs); 3137 } 3138 3139 if (qatomic_fetch_inc(&bs->io_plugged) == 0) { 3140 BlockDriver *drv = bs->drv; 3141 if (drv && drv->bdrv_io_plug) { 3142 drv->bdrv_io_plug(bs); 3143 } 3144 } 3145 } 3146 3147 void bdrv_io_unplug(BlockDriverState *bs) 3148 { 3149 BdrvChild *child; 3150 IO_CODE(); 3151 3152 assert(bs->io_plugged); 3153 if (qatomic_fetch_dec(&bs->io_plugged) == 1) { 3154 BlockDriver *drv = bs->drv; 3155 if (drv && drv->bdrv_io_unplug) { 3156 drv->bdrv_io_unplug(bs); 3157 } 3158 } 3159 3160 QLIST_FOREACH(child, &bs->children, next) { 3161 bdrv_io_unplug(child->bs); 3162 } 3163 } 3164 3165 /* Helper that undoes bdrv_register_buf() when it fails partway through */ 3166 static void bdrv_register_buf_rollback(BlockDriverState *bs, 3167 void *host, 3168 size_t size, 3169 BdrvChild *final_child) 3170 { 3171 BdrvChild *child; 3172 3173 QLIST_FOREACH(child, &bs->children, next) { 3174 if (child == final_child) { 3175 break; 3176 } 3177 3178 bdrv_unregister_buf(child->bs, host, size); 3179 } 3180 3181 if (bs->drv && bs->drv->bdrv_unregister_buf) { 3182 bs->drv->bdrv_unregister_buf(bs, host, size); 3183 } 3184 } 3185 3186 bool bdrv_register_buf(BlockDriverState *bs, void *host, size_t size, 3187 Error **errp) 3188 { 3189 BdrvChild *child; 3190 3191 GLOBAL_STATE_CODE(); 3192 if (bs->drv && bs->drv->bdrv_register_buf) { 3193 if (!bs->drv->bdrv_register_buf(bs, host, size, errp)) { 3194 return false; 3195 } 3196 } 3197 QLIST_FOREACH(child, &bs->children, next) { 3198 if (!bdrv_register_buf(child->bs, host, size, errp)) { 3199 bdrv_register_buf_rollback(bs, host, size, child); 3200 return false; 3201 } 3202 } 3203 return true; 3204 } 3205 3206 void bdrv_unregister_buf(BlockDriverState *bs, void *host, size_t size) 3207 { 3208 BdrvChild *child; 3209 3210 GLOBAL_STATE_CODE(); 3211 if (bs->drv && bs->drv->bdrv_unregister_buf) { 3212 bs->drv->bdrv_unregister_buf(bs, host, size); 3213 } 3214 QLIST_FOREACH(child, &bs->children, next) { 3215 bdrv_unregister_buf(child->bs, host, size); 3216 } 3217 } 3218 3219 static int coroutine_fn bdrv_co_copy_range_internal( 3220 BdrvChild *src, int64_t src_offset, BdrvChild *dst, 3221 int64_t dst_offset, int64_t bytes, 3222 BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3223 bool recurse_src) 3224 { 3225 BdrvTrackedRequest req; 3226 int ret; 3227 3228 /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3229 assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3230 assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 3231 assert(!(read_flags & BDRV_REQ_NO_WAIT)); 3232 assert(!(write_flags & BDRV_REQ_NO_WAIT)); 3233 3234 if (!dst || !dst->bs || 
!bdrv_is_inserted(dst->bs)) { 3235 return -ENOMEDIUM; 3236 } 3237 ret = bdrv_check_request32(dst_offset, bytes, NULL, 0); 3238 if (ret) { 3239 return ret; 3240 } 3241 if (write_flags & BDRV_REQ_ZERO_WRITE) { 3242 return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3243 } 3244 3245 if (!src || !src->bs || !bdrv_is_inserted(src->bs)) { 3246 return -ENOMEDIUM; 3247 } 3248 ret = bdrv_check_request32(src_offset, bytes, NULL, 0); 3249 if (ret) { 3250 return ret; 3251 } 3252 3253 if (!src->bs->drv->bdrv_co_copy_range_from 3254 || !dst->bs->drv->bdrv_co_copy_range_to 3255 || src->bs->encrypted || dst->bs->encrypted) { 3256 return -ENOTSUP; 3257 } 3258 3259 if (recurse_src) { 3260 bdrv_inc_in_flight(src->bs); 3261 tracked_request_begin(&req, src->bs, src_offset, bytes, 3262 BDRV_TRACKED_READ); 3263 3264 /* BDRV_REQ_SERIALISING is only for write operation */ 3265 assert(!(read_flags & BDRV_REQ_SERIALISING)); 3266 bdrv_wait_serialising_requests(&req); 3267 3268 ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3269 src, src_offset, 3270 dst, dst_offset, 3271 bytes, 3272 read_flags, write_flags); 3273 3274 tracked_request_end(&req); 3275 bdrv_dec_in_flight(src->bs); 3276 } else { 3277 bdrv_inc_in_flight(dst->bs); 3278 tracked_request_begin(&req, dst->bs, dst_offset, bytes, 3279 BDRV_TRACKED_WRITE); 3280 ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 3281 write_flags); 3282 if (!ret) { 3283 ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3284 src, src_offset, 3285 dst, dst_offset, 3286 bytes, 3287 read_flags, write_flags); 3288 } 3289 bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3290 tracked_request_end(&req); 3291 bdrv_dec_in_flight(dst->bs); 3292 } 3293 3294 return ret; 3295 } 3296 3297 /* Copy range from @src to @dst. 3298 * 3299 * See the comment of bdrv_co_copy_range for the parameter and return value 3300 * semantics. */ 3301 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset, 3302 BdrvChild *dst, int64_t dst_offset, 3303 int64_t bytes, 3304 BdrvRequestFlags read_flags, 3305 BdrvRequestFlags write_flags) 3306 { 3307 IO_CODE(); 3308 trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3309 read_flags, write_flags); 3310 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3311 bytes, read_flags, write_flags, true); 3312 } 3313 3314 /* Copy range from @src to @dst. 3315 * 3316 * See the comment of bdrv_co_copy_range for the parameter and return value 3317 * semantics. 
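 * Unlike bdrv_co_copy_range_from(), the operation is tracked as a write
 * request on the destination node and is handed to the destination
 * driver's bdrv_co_copy_range_to callback.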
*/ 3318 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset, 3319 BdrvChild *dst, int64_t dst_offset, 3320 int64_t bytes, 3321 BdrvRequestFlags read_flags, 3322 BdrvRequestFlags write_flags) 3323 { 3324 IO_CODE(); 3325 trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3326 read_flags, write_flags); 3327 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3328 bytes, read_flags, write_flags, false); 3329 } 3330 3331 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, 3332 BdrvChild *dst, int64_t dst_offset, 3333 int64_t bytes, BdrvRequestFlags read_flags, 3334 BdrvRequestFlags write_flags) 3335 { 3336 IO_CODE(); 3337 return bdrv_co_copy_range_from(src, src_offset, 3338 dst, dst_offset, 3339 bytes, read_flags, write_flags); 3340 } 3341 3342 static void bdrv_parent_cb_resize(BlockDriverState *bs) 3343 { 3344 BdrvChild *c; 3345 QLIST_FOREACH(c, &bs->parents, next_parent) { 3346 if (c->klass->resize) { 3347 c->klass->resize(c); 3348 } 3349 } 3350 } 3351 3352 /** 3353 * Truncate file to 'offset' bytes (needed only for file protocols) 3354 * 3355 * If 'exact' is true, the file must be resized to exactly the given 3356 * 'offset'. Otherwise, it is sufficient for the node to be at least 3357 * 'offset' bytes in length. 3358 */ 3359 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, 3360 PreallocMode prealloc, BdrvRequestFlags flags, 3361 Error **errp) 3362 { 3363 BlockDriverState *bs = child->bs; 3364 BdrvChild *filtered, *backing; 3365 BlockDriver *drv = bs->drv; 3366 BdrvTrackedRequest req; 3367 int64_t old_size, new_bytes; 3368 int ret; 3369 IO_CODE(); 3370 3371 /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 3372 if (!drv) { 3373 error_setg(errp, "No medium inserted"); 3374 return -ENOMEDIUM; 3375 } 3376 if (offset < 0) { 3377 error_setg(errp, "Image size cannot be negative"); 3378 return -EINVAL; 3379 } 3380 3381 ret = bdrv_check_request(offset, 0, errp); 3382 if (ret < 0) { 3383 return ret; 3384 } 3385 3386 old_size = bdrv_getlength(bs); 3387 if (old_size < 0) { 3388 error_setg_errno(errp, -old_size, "Failed to get old image size"); 3389 return old_size; 3390 } 3391 3392 if (bdrv_is_read_only(bs)) { 3393 error_setg(errp, "Image is read-only"); 3394 return -EACCES; 3395 } 3396 3397 if (offset > old_size) { 3398 new_bytes = offset - old_size; 3399 } else { 3400 new_bytes = 0; 3401 } 3402 3403 bdrv_inc_in_flight(bs); 3404 tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, 3405 BDRV_TRACKED_TRUNCATE); 3406 3407 /* If we are growing the image and potentially using preallocation for the 3408 * new area, we need to make sure that no write requests are made to it 3409 * concurrently or they might be overwritten by preallocation. */ 3410 if (new_bytes) { 3411 bdrv_make_request_serialising(&req, 1); 3412 } 3413 ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, 3414 0); 3415 if (ret < 0) { 3416 error_setg_errno(errp, -ret, 3417 "Failed to prepare request for truncation"); 3418 goto out; 3419 } 3420 3421 filtered = bdrv_filter_child(bs); 3422 backing = bdrv_cow_child(bs); 3423 3424 /* 3425 * If the image has a backing file that is large enough that it would 3426 * provide data for the new area, we cannot leave it unallocated because 3427 * then the backing file content would become visible. Instead, zero-fill 3428 * the new area. 
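     * (This is done below by adding BDRV_REQ_ZERO_WRITE to the truncate
     * flags when the backing file is larger than the old image size.)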
3429 * 3430 * Note that if the image has a backing file, but was opened without the 3431 * backing file, taking care of keeping things consistent with that backing 3432 * file is the user's responsibility. 3433 */ 3434 if (new_bytes && backing) { 3435 int64_t backing_len; 3436 3437 backing_len = bdrv_getlength(backing->bs); 3438 if (backing_len < 0) { 3439 ret = backing_len; 3440 error_setg_errno(errp, -ret, "Could not get backing file size"); 3441 goto out; 3442 } 3443 3444 if (backing_len > old_size) { 3445 flags |= BDRV_REQ_ZERO_WRITE; 3446 } 3447 } 3448 3449 if (drv->bdrv_co_truncate) { 3450 if (flags & ~bs->supported_truncate_flags) { 3451 error_setg(errp, "Block driver does not support requested flags"); 3452 ret = -ENOTSUP; 3453 goto out; 3454 } 3455 ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); 3456 } else if (filtered) { 3457 ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp); 3458 } else { 3459 error_setg(errp, "Image format driver does not support resize"); 3460 ret = -ENOTSUP; 3461 goto out; 3462 } 3463 if (ret < 0) { 3464 goto out; 3465 } 3466 3467 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3468 if (ret < 0) { 3469 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 3470 } else { 3471 offset = bs->total_sectors * BDRV_SECTOR_SIZE; 3472 } 3473 /* It's possible that truncation succeeded but refresh_total_sectors 3474 * failed, but the latter doesn't affect how we should finish the request. 3475 * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */ 3476 bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); 3477 3478 out: 3479 tracked_request_end(&req); 3480 bdrv_dec_in_flight(bs); 3481 3482 return ret; 3483 } 3484 3485 void bdrv_cancel_in_flight(BlockDriverState *bs) 3486 { 3487 GLOBAL_STATE_CODE(); 3488 if (!bs || !bs->drv) { 3489 return; 3490 } 3491 3492 if (bs->drv->bdrv_cancel_in_flight) { 3493 bs->drv->bdrv_cancel_in_flight(bs); 3494 } 3495 } 3496 3497 int coroutine_fn 3498 bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes, 3499 QEMUIOVector *qiov, size_t qiov_offset) 3500 { 3501 BlockDriverState *bs = child->bs; 3502 BlockDriver *drv = bs->drv; 3503 int ret; 3504 IO_CODE(); 3505 3506 if (!drv) { 3507 return -ENOMEDIUM; 3508 } 3509 3510 if (!drv->bdrv_co_preadv_snapshot) { 3511 return -ENOTSUP; 3512 } 3513 3514 bdrv_inc_in_flight(bs); 3515 ret = drv->bdrv_co_preadv_snapshot(bs, offset, bytes, qiov, qiov_offset); 3516 bdrv_dec_in_flight(bs); 3517 3518 return ret; 3519 } 3520 3521 int coroutine_fn 3522 bdrv_co_snapshot_block_status(BlockDriverState *bs, 3523 bool want_zero, int64_t offset, int64_t bytes, 3524 int64_t *pnum, int64_t *map, 3525 BlockDriverState **file) 3526 { 3527 BlockDriver *drv = bs->drv; 3528 int ret; 3529 IO_CODE(); 3530 3531 if (!drv) { 3532 return -ENOMEDIUM; 3533 } 3534 3535 if (!drv->bdrv_co_snapshot_block_status) { 3536 return -ENOTSUP; 3537 } 3538 3539 bdrv_inc_in_flight(bs); 3540 ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes, 3541 pnum, map, file); 3542 bdrv_dec_in_flight(bs); 3543 3544 return ret; 3545 } 3546 3547 int coroutine_fn 3548 bdrv_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes) 3549 { 3550 BlockDriver *drv = bs->drv; 3551 int ret; 3552 IO_CODE(); 3553 3554 if (!drv) { 3555 return -ENOMEDIUM; 3556 } 3557 3558 if (!drv->bdrv_co_pdiscard_snapshot) { 3559 return -ENOTSUP; 3560 } 3561 3562 bdrv_inc_in_flight(bs); 3563 ret = 
drv->bdrv_co_pdiscard_snapshot(bs, offset, bytes); 3564 bdrv_dec_in_flight(bs); 3565 3566 return ret; 3567 } 3568
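
/*
 * Illustration only, excluded from the build on purpose: a minimal sketch of
 * how a caller running in a coroutine might use the byte-based write API
 * above. The function name and the fixed 64 KiB length are assumptions made
 * purely for the example; error handling is reduced to returning the first
 * failure.
 */
#if 0
static int coroutine_fn example_write_then_zero(BdrvChild *child,
                                                int64_t offset,
                                                const void *data)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, data, 65536);
    int ret;

    /* Write 64 KiB of guest data at @offset */
    ret = bdrv_co_pwritev(child, offset, 65536, &qiov, 0);
    if (ret < 0) {
        return ret;
    }

    /* Zero the following 64 KiB, allowing the driver to unmap it */
    return bdrv_co_pwrite_zeroes(child, offset + 65536, 65536,
                                 BDRV_REQ_MAY_UNMAP);
}
#endif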