/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "block/coroutines.h"
#include "block/write-threshold.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                                      bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->klass->drained_end) {
        c->klass->drained_end(c, drained_end_counter);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, qatomic_read(&drained_end_counter) > 0);
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                                    bool ignore_bds_parents,
                                    int *drained_end_counter)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    c->parent_quiesce_counter++;
    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}
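/*
 * Note on the merge rules in bdrv_merge_limits() below: alignments and
 * opt_transfer are combined with MAX() (the stricter, larger requirement
 * wins), while the max_* limits use MIN_NON_ZERO(), because a value of 0 in
 * a BlockLimits field means "no limit known" rather than "limit of zero".
 */
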
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
                                        src->max_hw_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

typedef struct BdrvRefreshLimitsState {
    BlockDriverState *bs;
    BlockLimits old_bl;
} BdrvRefreshLimitsState;

static void bdrv_refresh_limits_abort(void *opaque)
{
    BdrvRefreshLimitsState *s = opaque;

    s->bs->bl = s->old_bl;
}

static TransactionActionDrv bdrv_refresh_limits_drv = {
    .abort = bdrv_refresh_limits_abort,
    .clean = g_free,
};

/* @tran is allowed to be NULL, in which case no rollback is possible. */
void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
{
    ERRP_GUARD();
    BlockDriver *drv = bs->drv;
    BdrvChild *c;
    bool have_limits;

    if (tran) {
        BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
        *s = (BdrvRefreshLimitsState) {
            .bs = bs,
            .old_bl = bs->bl,
        };
        tran_add(tran, &bdrv_refresh_limits_drv, s);
    }

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether the driver has a byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    have_limits = false;
    QLIST_FOREACH(c, &bs->children, next) {
        if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
        {
            bdrv_refresh_limits(c->bs, tran, errp);
            if (*errp) {
                return;
            }
            bdrv_merge_limits(&bs->bl, &c->bs->bl);
            have_limits = true;
        }
    }

    if (!have_limits) {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size;

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
        if (*errp) {
            return;
        }
    }

    if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
        error_setg(errp, "Driver requires too large request alignment");
    }
}
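/*
 * Illustrative example for bdrv_refresh_limits() above (not a guarantee about
 * specific drivers): for a format node on top of a protocol child, the loop
 * first refreshes and merges the child's limits into bs->bl, the driver's own
 * bdrv_refresh_limits callback may then tighten them further, e.g. by raising
 * request_alignment, and the final check rejects alignments beyond
 * BDRV_MAX_ALIGNMENT.
 */
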
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    qatomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = qatomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
    int *drained_end_counter;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    qatomic_mb_set(&data->done, true);
    if (!data->begin) {
        qatomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        qatomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (qatomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter);
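/*
 * Rough sketch of the coroutine path through the functions below: when a
 * drain is requested from coroutine context, bdrv_co_yield_to_drain()
 * schedules bdrv_co_drain_bh_cb() as a one-shot bottom half in the node's
 * AioContext and yields; the BH then performs the actual
 * bdrv_do_drained_begin()/end() outside of coroutine context and wakes the
 * coroutine up again with aio_co_wake().
 */
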
static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        aio_context_acquire(ctx);
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        aio_context_release(ctx);
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;
    Coroutine *self = qemu_coroutine_self();
    AioContext *ctx = bdrv_get_aio_context(bs);
    AioContext *co_ctx = qemu_coroutine_get_aio_context(self);

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = self,
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }

    /*
     * Temporarily drop the lock across yield or we would get deadlocks.
     * bdrv_co_drain_bh_cb() reacquires the lock as needed.
     *
     * When we yield below, the lock for the current context will be
     * released, so if this is actually the lock that protects bs, don't drop
     * it a second time.
     */
    if (ctx != co_ctx) {
        aio_context_release(ctx);
    }
    replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);

    /* Reacquire the AioContext of bs if we dropped it */
    if (ctx != co_ctx) {
        aio_context_acquire(ctx);
    }
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true, NULL);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}
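/*
 * Typical usage of the drained-section API (a minimal sketch; see
 * bdrv_drain() below for the same pattern in this file):
 *
 *     bdrv_drained_begin(bs);
 *     ... modify state that must not race with I/O on this node ...
 *     bdrv_drained_end(bs);
 *
 * New requests are quiesced at bdrv_drained_begin() and in-flight ones are
 * polled to completion before it returns; bdrv_drained_end() resumes them.
 */
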
void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

/**
 * This function does not poll, nor must any of its recursively called
 * functions. The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles. Therefore, the pointer must remain valid
 * until the pointee reaches 0. That implies that whoever sets up the
 * pointee has to poll until it is 0.
 *
 * We use atomic operations to access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int drained_end_counter = 0;
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false,
                            &drained_end_counter);
    }

    BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
}
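/*
 * Note: bdrv_apply_subtree_drain() and bdrv_unapply_subtree_drain() above are
 * meant to be used when a child is attached to or detached from a parent that
 * is currently inside one or more subtree-drained sections: the child's
 * subtree is drained (or un-drained) once per recursive_quiesce_counter level
 * so that its quiesce state matches the parent's.
 */
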
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the AioContext of
 * the BlockDriverState.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(qatomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}
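/*
 * A minimal usage sketch for the "drain all" variant (mirroring what
 * bdrv_drain_all() further down does in a single call):
 *
 *     bdrv_drain_all_begin();
 *     ... work that must not run concurrently with any block I/O ...
 *     bdrv_drain_all_end();
 *
 * As the comment on bdrv_drain_all_begin() notes, no new block jobs or
 * BlockDriverStates may be created between the two calls.
 */
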
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay; waiting for the
     * in-flight I/O requests to finish could take forever.
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
{
    int drained_end_counter = 0;

    g_assert(bs->quiesce_counter > 0);
    g_assert(!bs->refcnt);

    while (bs->quiesce_counter) {
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
    }
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    int drained_end_counter = 0;

    /*
     * The bdrv queue is managed by record/replay; waiting for the
     * in-flight I/O requests to finish could take forever.
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
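/*
 * Tracked requests: every read/write/discard/truncate that goes through this
 * file is registered in bs->tracked_requests for its lifetime. The usual
 * pattern (as used by bdrv_co_preadv_part() and friends later in this file)
 * is, roughly:
 *
 *     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
 *     ret = bdrv_aligned_preadv(child, &req, offset, bytes, align, ...);
 *     tracked_request_end(&req);
 *
 * Serialising requests use the list to detect overlaps and wait for each
 * other (see bdrv_find_conflicting_request() below).
 */
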
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        qatomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  int64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    bdrv_check_request(offset, bytes, &error_abort);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, int64_t bytes)
{
    bdrv_check_request(offset, bytes, &error_abort);

    /* aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
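/*
 * Worked example for the overlap test above (illustrative numbers): a request
 * with overlap_offset = 4096 and overlap_bytes = 4096 covers [4096, 8192).
 * A candidate range [8192, 12288) fails the first check (it starts exactly at
 * the existing request's end) and so does not overlap; a range [6144, 10240)
 * passes both checks and is considered overlapping.
 */
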
/* Called with self->bs->reqs_lock held */
static BdrvTrackedRequest *
bdrv_find_conflicting_request(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;

    QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
        if (req == self || (!req->serialising && !self->serialising)) {
            continue;
        }
        if (tracked_request_overlaps(req, self->overlap_offset,
                                     self->overlap_bytes))
        {
            /*
             * Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests. This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            /*
             * If the request is already (indirectly) waiting for us, or
             * will wait for us as soon as it wakes up, then just go on
             * (instead of producing a deadlock in the former case).
             */
            if (!req->waiting_for) {
                return req;
            }
        }
    }

    return NULL;
}

/* Called with self->bs->reqs_lock held */
static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool waited = false;

    while ((req = bdrv_find_conflicting_request(self))) {
        self->waiting_for = req;
        qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
        self->waiting_for = NULL;
        waited = true;
    }

    return waited;
}

/* Called with req->bs->reqs_lock held */
static void tracked_request_set_serialising(BdrvTrackedRequest *req,
                                            uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    int64_t overlap_bytes =
        ROUND_UP(req->offset + req->bytes, align) - overlap_offset;

    bdrv_check_request(req->offset, req->bytes, &error_abort);

    if (!req->serialising) {
        qatomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
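/*
 * Worked example for bdrv_round_to_clusters() above (illustrative numbers):
 * with a cluster size of 65536, a request at offset 65540 with 100 bytes is
 * rounded to cluster_offset = 65536 and cluster_bytes = 65536, i.e. the whole
 * cluster containing the request. A request that straddles a cluster boundary
 * grows to cover every cluster it touches.
 */
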
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    qatomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    qatomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    bool waited = false;

    if (!qatomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    waited = bdrv_wait_serialising_requests_locked(self);
    qemu_co_mutex_unlock(&bs->reqs_lock);

    return waited;
}

bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
                                                uint64_t align)
{
    bool waited;

    qemu_co_mutex_lock(&req->bs->reqs_lock);

    tracked_request_set_serialising(req, align);
    waited = bdrv_wait_serialising_requests_locked(req);

    qemu_co_mutex_unlock(&req->bs->reqs_lock);

    return waited;
}

static int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
                                   QEMUIOVector *qiov, size_t qiov_offset,
                                   Error **errp)
{
    /*
     * Check generic offset/bytes correctness
     */

    if (offset < 0) {
        error_setg(errp, "offset is negative: %" PRIi64, offset);
        return -EIO;
    }

    if (bytes < 0) {
        error_setg(errp, "bytes is negative: %" PRIi64, bytes);
        return -EIO;
    }

    if (bytes > BDRV_MAX_LENGTH) {
        error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   bytes, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH) {
        error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   offset, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH - bytes) {
        error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
                   "exceeds maximum(%" PRIi64 ")", offset, bytes,
                   BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (!qiov) {
        return 0;
    }

    /*
     * Check qiov and qiov_offset
     */

    if (qiov_offset > qiov->size) {
        error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
                   qiov_offset, qiov->size);
        return -EIO;
    }

    if (bytes > qiov->size - qiov_offset) {
        error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
                   "vector size(%zu)", bytes, qiov_offset, qiov->size);
        return -EIO;
    }

    return 0;
}

int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
{
    return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
}

static int bdrv_check_request32(int64_t offset, int64_t bytes,
                                QEMUIOVector *qiov, size_t qiov_offset)
{
    int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
    if (ret < 0) {
        return ret;
    }

    if (bytes > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    return 0;
}
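/*
 * The bdrv_pwrite_zeroes() wrapper below is a write with no payload: it goes
 * through the normal write path with BDRV_REQ_ZERO_WRITE set and a NULL qiov.
 * A minimal illustrative call that zeroes the first 64 KiB of an image
 * (numbers are only an example):
 *
 *     ret = bdrv_pwrite_zeroes(child, 0, 65536, BDRV_REQ_MAY_UNMAP);
 *
 * BDRV_REQ_MAY_UNMAP additionally allows the driver to deallocate the range
 * as long as it still reads back as zeroes afterwards.
 */
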
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int64_t bytes, BdrvRequestFlags flags)
{
    return bdrv_pwritev(child, offset, bytes, NULL,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes)
{
    int ret;
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    ret = bdrv_preadv(child, offset, bytes, &qiov, 0);

    return ret < 0 ? ret : bytes;
}

/* Return no. of bytes on success or < 0 on error. Important errors are:
   -EIO         generic I/O error (may happen for all errors)
   -ENOMEDIUM   No media inserted.
   -EINVAL      Invalid offset or number of bytes
   -EACCES      Trying to write a read-only device
 */
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf,
                int64_t bytes)
{
    int ret;
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    ret = bdrv_pwritev(child, offset, bytes, &qiov, 0);

    return ret < 0 ? ret : bytes;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
1126 */ 1127 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, 1128 const void *buf, int64_t count) 1129 { 1130 int ret; 1131 1132 ret = bdrv_pwrite(child, offset, buf, count); 1133 if (ret < 0) { 1134 return ret; 1135 } 1136 1137 ret = bdrv_flush(child->bs); 1138 if (ret < 0) { 1139 return ret; 1140 } 1141 1142 return 0; 1143 } 1144 1145 typedef struct CoroutineIOCompletion { 1146 Coroutine *coroutine; 1147 int ret; 1148 } CoroutineIOCompletion; 1149 1150 static void bdrv_co_io_em_complete(void *opaque, int ret) 1151 { 1152 CoroutineIOCompletion *co = opaque; 1153 1154 co->ret = ret; 1155 aio_co_wake(co->coroutine); 1156 } 1157 1158 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, 1159 int64_t offset, int64_t bytes, 1160 QEMUIOVector *qiov, 1161 size_t qiov_offset, int flags) 1162 { 1163 BlockDriver *drv = bs->drv; 1164 int64_t sector_num; 1165 unsigned int nb_sectors; 1166 QEMUIOVector local_qiov; 1167 int ret; 1168 1169 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1170 assert(!(flags & ~BDRV_REQ_MASK)); 1171 assert(!(flags & BDRV_REQ_NO_FALLBACK)); 1172 1173 if (!drv) { 1174 return -ENOMEDIUM; 1175 } 1176 1177 if (drv->bdrv_co_preadv_part) { 1178 return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset, 1179 flags); 1180 } 1181 1182 if (qiov_offset > 0 || bytes != qiov->size) { 1183 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1184 qiov = &local_qiov; 1185 } 1186 1187 if (drv->bdrv_co_preadv) { 1188 ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); 1189 goto out; 1190 } 1191 1192 if (drv->bdrv_aio_preadv) { 1193 BlockAIOCB *acb; 1194 CoroutineIOCompletion co = { 1195 .coroutine = qemu_coroutine_self(), 1196 }; 1197 1198 acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags, 1199 bdrv_co_io_em_complete, &co); 1200 if (acb == NULL) { 1201 ret = -EIO; 1202 goto out; 1203 } else { 1204 qemu_coroutine_yield(); 1205 ret = co.ret; 1206 goto out; 1207 } 1208 } 1209 1210 sector_num = offset >> BDRV_SECTOR_BITS; 1211 nb_sectors = bytes >> BDRV_SECTOR_BITS; 1212 1213 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 1214 assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 1215 assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1216 assert(drv->bdrv_co_readv); 1217 1218 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 1219 1220 out: 1221 if (qiov == &local_qiov) { 1222 qemu_iovec_destroy(&local_qiov); 1223 } 1224 1225 return ret; 1226 } 1227 1228 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, 1229 int64_t offset, int64_t bytes, 1230 QEMUIOVector *qiov, 1231 size_t qiov_offset, int flags) 1232 { 1233 BlockDriver *drv = bs->drv; 1234 int64_t sector_num; 1235 unsigned int nb_sectors; 1236 QEMUIOVector local_qiov; 1237 int ret; 1238 1239 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1240 assert(!(flags & ~BDRV_REQ_MASK)); 1241 assert(!(flags & BDRV_REQ_NO_FALLBACK)); 1242 1243 if (!drv) { 1244 return -ENOMEDIUM; 1245 } 1246 1247 if (drv->bdrv_co_pwritev_part) { 1248 ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 1249 flags & bs->supported_write_flags); 1250 flags &= ~bs->supported_write_flags; 1251 goto emulate_flags; 1252 } 1253 1254 if (qiov_offset > 0 || bytes != qiov->size) { 1255 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1256 qiov = &local_qiov; 1257 } 1258 1259 if (drv->bdrv_co_pwritev) { 1260 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, 1261 flags & bs->supported_write_flags); 1262 flags &= 
~bs->supported_write_flags; 1263 goto emulate_flags; 1264 } 1265 1266 if (drv->bdrv_aio_pwritev) { 1267 BlockAIOCB *acb; 1268 CoroutineIOCompletion co = { 1269 .coroutine = qemu_coroutine_self(), 1270 }; 1271 1272 acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, 1273 flags & bs->supported_write_flags, 1274 bdrv_co_io_em_complete, &co); 1275 flags &= ~bs->supported_write_flags; 1276 if (acb == NULL) { 1277 ret = -EIO; 1278 } else { 1279 qemu_coroutine_yield(); 1280 ret = co.ret; 1281 } 1282 goto emulate_flags; 1283 } 1284 1285 sector_num = offset >> BDRV_SECTOR_BITS; 1286 nb_sectors = bytes >> BDRV_SECTOR_BITS; 1287 1288 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 1289 assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 1290 assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1291 1292 assert(drv->bdrv_co_writev); 1293 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, 1294 flags & bs->supported_write_flags); 1295 flags &= ~bs->supported_write_flags; 1296 1297 emulate_flags: 1298 if (ret == 0 && (flags & BDRV_REQ_FUA)) { 1299 ret = bdrv_co_flush(bs); 1300 } 1301 1302 if (qiov == &local_qiov) { 1303 qemu_iovec_destroy(&local_qiov); 1304 } 1305 1306 return ret; 1307 } 1308 1309 static int coroutine_fn 1310 bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset, 1311 int64_t bytes, QEMUIOVector *qiov, 1312 size_t qiov_offset) 1313 { 1314 BlockDriver *drv = bs->drv; 1315 QEMUIOVector local_qiov; 1316 int ret; 1317 1318 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1319 1320 if (!drv) { 1321 return -ENOMEDIUM; 1322 } 1323 1324 if (!block_driver_can_compress(drv)) { 1325 return -ENOTSUP; 1326 } 1327 1328 if (drv->bdrv_co_pwritev_compressed_part) { 1329 return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes, 1330 qiov, qiov_offset); 1331 } 1332 1333 if (qiov_offset == 0) { 1334 return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov); 1335 } 1336 1337 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1338 ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov); 1339 qemu_iovec_destroy(&local_qiov); 1340 1341 return ret; 1342 } 1343 1344 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 1345 int64_t offset, int64_t bytes, QEMUIOVector *qiov, 1346 size_t qiov_offset, int flags) 1347 { 1348 BlockDriverState *bs = child->bs; 1349 1350 /* Perform I/O through a temporary buffer so that users who scribble over 1351 * their read buffer while the operation is in progress do not end up 1352 * modifying the image file. This is critical for zero-copy guest I/O 1353 * where anything might happen inside guest memory. 1354 */ 1355 void *bounce_buffer = NULL; 1356 1357 BlockDriver *drv = bs->drv; 1358 int64_t cluster_offset; 1359 int64_t cluster_bytes; 1360 int64_t skip_bytes; 1361 int ret; 1362 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1363 BDRV_REQUEST_MAX_BYTES); 1364 int64_t progress = 0; 1365 bool skip_write; 1366 1367 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1368 1369 if (!drv) { 1370 return -ENOMEDIUM; 1371 } 1372 1373 /* 1374 * Do not write anything when the BDS is inactive. That is not 1375 * allowed, and it would not help. 1376 */ 1377 skip_write = (bs->open_flags & BDRV_O_INACTIVE); 1378 1379 /* FIXME We cannot require callers to have write permissions when all they 1380 * are doing is a read request. If we did things right, write permissions 1381 * would be obtained anyway, but internally by the copy-on-read code. 
As 1382 * long as it is implemented here rather than in a separate filter driver, 1383 * the copy-on-read code doesn't have its own BdrvChild, however, for which 1384 * it could request permissions. Therefore we have to bypass the permission 1385 * system for the moment. */ 1386 // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1387 1388 /* Cover entire cluster so no additional backing file I/O is required when 1389 * allocating cluster in the image file. Note that this value may exceed 1390 * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which 1391 * is one reason we loop rather than doing it all at once. 1392 */ 1393 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1394 skip_bytes = offset - cluster_offset; 1395 1396 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1397 cluster_offset, cluster_bytes); 1398 1399 while (cluster_bytes) { 1400 int64_t pnum; 1401 1402 if (skip_write) { 1403 ret = 1; /* "already allocated", so nothing will be copied */ 1404 pnum = MIN(cluster_bytes, max_transfer); 1405 } else { 1406 ret = bdrv_is_allocated(bs, cluster_offset, 1407 MIN(cluster_bytes, max_transfer), &pnum); 1408 if (ret < 0) { 1409 /* 1410 * Safe to treat errors in querying allocation as if 1411 * unallocated; we'll probably fail again soon on the 1412 * read, but at least that will set a decent errno. 1413 */ 1414 pnum = MIN(cluster_bytes, max_transfer); 1415 } 1416 1417 /* Stop at EOF if the image ends in the middle of the cluster */ 1418 if (ret == 0 && pnum == 0) { 1419 assert(progress >= bytes); 1420 break; 1421 } 1422 1423 assert(skip_bytes < pnum); 1424 } 1425 1426 if (ret <= 0) { 1427 QEMUIOVector local_qiov; 1428 1429 /* Must copy-on-read; use the bounce buffer */ 1430 pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 1431 if (!bounce_buffer) { 1432 int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); 1433 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); 1434 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); 1435 1436 bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len); 1437 if (!bounce_buffer) { 1438 ret = -ENOMEM; 1439 goto err; 1440 } 1441 } 1442 qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); 1443 1444 ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1445 &local_qiov, 0, 0); 1446 if (ret < 0) { 1447 goto err; 1448 } 1449 1450 bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1451 if (drv->bdrv_co_pwrite_zeroes && 1452 buffer_is_zero(bounce_buffer, pnum)) { 1453 /* FIXME: Should we (perhaps conditionally) be setting 1454 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1455 * that still correctly reads as zero? */ 1456 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 1457 BDRV_REQ_WRITE_UNCHANGED); 1458 } else { 1459 /* This does not change the data on the disk, it is not 1460 * necessary to flush even in cache=writethrough mode. 1461 */ 1462 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 1463 &local_qiov, 0, 1464 BDRV_REQ_WRITE_UNCHANGED); 1465 } 1466 1467 if (ret < 0) { 1468 /* It might be okay to ignore write errors for guest 1469 * requests. If this is a deliberate copy-on-read 1470 * then we don't want to ignore the error. Simply 1471 * report it in all cases. 
1472 */ 1473 goto err; 1474 } 1475 1476 if (!(flags & BDRV_REQ_PREFETCH)) { 1477 qemu_iovec_from_buf(qiov, qiov_offset + progress, 1478 bounce_buffer + skip_bytes, 1479 MIN(pnum - skip_bytes, bytes - progress)); 1480 } 1481 } else if (!(flags & BDRV_REQ_PREFETCH)) { 1482 /* Read directly into the destination */ 1483 ret = bdrv_driver_preadv(bs, offset + progress, 1484 MIN(pnum - skip_bytes, bytes - progress), 1485 qiov, qiov_offset + progress, 0); 1486 if (ret < 0) { 1487 goto err; 1488 } 1489 } 1490 1491 cluster_offset += pnum; 1492 cluster_bytes -= pnum; 1493 progress += pnum - skip_bytes; 1494 skip_bytes = 0; 1495 } 1496 ret = 0; 1497 1498 err: 1499 qemu_vfree(bounce_buffer); 1500 return ret; 1501 } 1502 1503 /* 1504 * Forwards an already correctly aligned request to the BlockDriver. This 1505 * handles copy on read, zeroing after EOF, and fragmentation of large 1506 * reads; any other features must be implemented by the caller. 1507 */ 1508 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 1509 BdrvTrackedRequest *req, int64_t offset, int64_t bytes, 1510 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 1511 { 1512 BlockDriverState *bs = child->bs; 1513 int64_t total_bytes, max_bytes; 1514 int ret = 0; 1515 int64_t bytes_remaining = bytes; 1516 int max_transfer; 1517 1518 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1519 assert(is_power_of_2(align)); 1520 assert((offset & (align - 1)) == 0); 1521 assert((bytes & (align - 1)) == 0); 1522 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1523 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1524 align); 1525 1526 /* TODO: We would need a per-BDS .supported_read_flags and 1527 * potential fallback support, if we ever implement any read flags 1528 * to pass through to drivers. For now, there aren't any 1529 * passthrough flags. */ 1530 assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH))); 1531 1532 /* Handle Copy on Read and associated serialisation */ 1533 if (flags & BDRV_REQ_COPY_ON_READ) { 1534 /* If we touch the same cluster it counts as an overlap. This 1535 * guarantees that allocating writes will be serialized and not race 1536 * with each other for the same cluster. For example, in copy-on-read 1537 * it ensures that the CoR read and write operations are atomic and 1538 * guest writes cannot interleave between them. 
*/ 1539 bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); 1540 } else { 1541 bdrv_wait_serialising_requests(req); 1542 } 1543 1544 if (flags & BDRV_REQ_COPY_ON_READ) { 1545 int64_t pnum; 1546 1547 /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */ 1548 flags &= ~BDRV_REQ_COPY_ON_READ; 1549 1550 ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 1551 if (ret < 0) { 1552 goto out; 1553 } 1554 1555 if (!ret || pnum != bytes) { 1556 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, 1557 qiov, qiov_offset, flags); 1558 goto out; 1559 } else if (flags & BDRV_REQ_PREFETCH) { 1560 goto out; 1561 } 1562 } 1563 1564 /* Forward the request to the BlockDriver, possibly fragmenting it */ 1565 total_bytes = bdrv_getlength(bs); 1566 if (total_bytes < 0) { 1567 ret = total_bytes; 1568 goto out; 1569 } 1570 1571 assert(!(flags & ~bs->supported_read_flags)); 1572 1573 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 1574 if (bytes <= max_bytes && bytes <= max_transfer) { 1575 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags); 1576 goto out; 1577 } 1578 1579 while (bytes_remaining) { 1580 int64_t num; 1581 1582 if (max_bytes) { 1583 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 1584 assert(num); 1585 1586 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1587 num, qiov, 1588 qiov_offset + bytes - bytes_remaining, 1589 flags); 1590 max_bytes -= num; 1591 } else { 1592 num = bytes_remaining; 1593 ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining, 1594 0, bytes_remaining); 1595 } 1596 if (ret < 0) { 1597 goto out; 1598 } 1599 bytes_remaining -= num; 1600 } 1601 1602 out: 1603 return ret < 0 ? ret : 0; 1604 } 1605 1606 /* 1607 * Request padding 1608 * 1609 * |<---- align ----->| |<----- align ---->| 1610 * |<- head ->|<------------- bytes ------------->|<-- tail -->| 1611 * | | | | | | 1612 * -*----------$-------*-------- ... --------*-----$------------*--- 1613 * | | | | | | 1614 * | offset | | end | 1615 * ALIGN_DOWN(offset) ALIGN_UP(offset) ALIGN_DOWN(end) ALIGN_UP(end) 1616 * [buf ... ) [tail_buf ) 1617 * 1618 * @buf is an aligned allocation needed to store @head and @tail paddings. @head 1619 * is placed at the beginning of @buf and @tail at the @end. 1620 * 1621 * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk 1622 * around tail, if tail exists. 1623 * 1624 * @merge_reads is true for small requests, 1625 * if @buf_len == @head + bytes + @tail. In this case it is possible that both 1626 * head and tail exist but @buf_len == align and @tail_buf == @buf. 
1627 */ 1628 typedef struct BdrvRequestPadding { 1629 uint8_t *buf; 1630 size_t buf_len; 1631 uint8_t *tail_buf; 1632 size_t head; 1633 size_t tail; 1634 bool merge_reads; 1635 QEMUIOVector local_qiov; 1636 } BdrvRequestPadding; 1637 1638 static bool bdrv_init_padding(BlockDriverState *bs, 1639 int64_t offset, int64_t bytes, 1640 BdrvRequestPadding *pad) 1641 { 1642 int64_t align = bs->bl.request_alignment; 1643 int64_t sum; 1644 1645 bdrv_check_request(offset, bytes, &error_abort); 1646 assert(align <= INT_MAX); /* documented in block/block_int.h */ 1647 assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */ 1648 1649 memset(pad, 0, sizeof(*pad)); 1650 1651 pad->head = offset & (align - 1); 1652 pad->tail = ((offset + bytes) & (align - 1)); 1653 if (pad->tail) { 1654 pad->tail = align - pad->tail; 1655 } 1656 1657 if (!pad->head && !pad->tail) { 1658 return false; 1659 } 1660 1661 assert(bytes); /* Nothing good in aligning zero-length requests */ 1662 1663 sum = pad->head + bytes + pad->tail; 1664 pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align; 1665 pad->buf = qemu_blockalign(bs, pad->buf_len); 1666 pad->merge_reads = sum == pad->buf_len; 1667 if (pad->tail) { 1668 pad->tail_buf = pad->buf + pad->buf_len - align; 1669 } 1670 1671 return true; 1672 } 1673 1674 static int bdrv_padding_rmw_read(BdrvChild *child, 1675 BdrvTrackedRequest *req, 1676 BdrvRequestPadding *pad, 1677 bool zero_middle) 1678 { 1679 QEMUIOVector local_qiov; 1680 BlockDriverState *bs = child->bs; 1681 uint64_t align = bs->bl.request_alignment; 1682 int ret; 1683 1684 assert(req->serialising && pad->buf); 1685 1686 if (pad->head || pad->merge_reads) { 1687 int64_t bytes = pad->merge_reads ? pad->buf_len : align; 1688 1689 qemu_iovec_init_buf(&local_qiov, pad->buf, bytes); 1690 1691 if (pad->head) { 1692 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1693 } 1694 if (pad->merge_reads && pad->tail) { 1695 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1696 } 1697 ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes, 1698 align, &local_qiov, 0, 0); 1699 if (ret < 0) { 1700 return ret; 1701 } 1702 if (pad->head) { 1703 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1704 } 1705 if (pad->merge_reads && pad->tail) { 1706 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1707 } 1708 1709 if (pad->merge_reads) { 1710 goto zero_mem; 1711 } 1712 } 1713 1714 if (pad->tail) { 1715 qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align); 1716 1717 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1718 ret = bdrv_aligned_preadv( 1719 child, req, 1720 req->overlap_offset + req->overlap_bytes - align, 1721 align, align, &local_qiov, 0, 0); 1722 if (ret < 0) { 1723 return ret; 1724 } 1725 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1726 } 1727 1728 zero_mem: 1729 if (zero_middle) { 1730 memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail); 1731 } 1732 1733 return 0; 1734 } 1735 1736 static void bdrv_padding_destroy(BdrvRequestPadding *pad) 1737 { 1738 if (pad->buf) { 1739 qemu_vfree(pad->buf); 1740 qemu_iovec_destroy(&pad->local_qiov); 1741 } 1742 memset(pad, 0, sizeof(*pad)); 1743 } 1744 1745 /* 1746 * bdrv_pad_request 1747 * 1748 * Exchange request parameters with padded request if needed. Don't include RMW 1749 * read of padding, bdrv_padding_rmw_read() should be called separately if 1750 * needed. 
1751 * 1752 * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out: 1753 * - on function start they represent original request 1754 * - on failure or when padding is not needed they are unchanged 1755 * - on success when padding is needed they represent padded request 1756 */ 1757 static int bdrv_pad_request(BlockDriverState *bs, 1758 QEMUIOVector **qiov, size_t *qiov_offset, 1759 int64_t *offset, int64_t *bytes, 1760 BdrvRequestPadding *pad, bool *padded) 1761 { 1762 int ret; 1763 1764 bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort); 1765 1766 if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { 1767 if (padded) { 1768 *padded = false; 1769 } 1770 return 0; 1771 } 1772 1773 ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, 1774 *qiov, *qiov_offset, *bytes, 1775 pad->buf + pad->buf_len - pad->tail, 1776 pad->tail); 1777 if (ret < 0) { 1778 bdrv_padding_destroy(pad); 1779 return ret; 1780 } 1781 *bytes += pad->head + pad->tail; 1782 *offset -= pad->head; 1783 *qiov = &pad->local_qiov; 1784 *qiov_offset = 0; 1785 if (padded) { 1786 *padded = true; 1787 } 1788 1789 return 0; 1790 } 1791 1792 int coroutine_fn bdrv_co_preadv(BdrvChild *child, 1793 int64_t offset, int64_t bytes, QEMUIOVector *qiov, 1794 BdrvRequestFlags flags) 1795 { 1796 return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags); 1797 } 1798 1799 int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, 1800 int64_t offset, int64_t bytes, 1801 QEMUIOVector *qiov, size_t qiov_offset, 1802 BdrvRequestFlags flags) 1803 { 1804 BlockDriverState *bs = child->bs; 1805 BdrvTrackedRequest req; 1806 BdrvRequestPadding pad; 1807 int ret; 1808 1809 trace_bdrv_co_preadv_part(bs, offset, bytes, flags); 1810 1811 if (!bdrv_is_inserted(bs)) { 1812 return -ENOMEDIUM; 1813 } 1814 1815 ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); 1816 if (ret < 0) { 1817 return ret; 1818 } 1819 1820 if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 1821 /* 1822 * Aligning zero request is nonsense. Even if driver has special meaning 1823 * of zero-length (like qcow2_co_pwritev_compressed_part), we can't pass 1824 * it to driver due to request_alignment. 1825 * 1826 * Still, no reason to return an error if someone do unaligned 1827 * zero-length read occasionally. 
1828 */ 1829 return 0; 1830 } 1831 1832 bdrv_inc_in_flight(bs); 1833 1834 /* Don't do copy-on-read if we read data before write operation */ 1835 if (qatomic_read(&bs->copy_on_read)) { 1836 flags |= BDRV_REQ_COPY_ON_READ; 1837 } 1838 1839 ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, 1840 NULL); 1841 if (ret < 0) { 1842 return ret; 1843 } 1844 1845 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 1846 ret = bdrv_aligned_preadv(child, &req, offset, bytes, 1847 bs->bl.request_alignment, 1848 qiov, qiov_offset, flags); 1849 tracked_request_end(&req); 1850 bdrv_dec_in_flight(bs); 1851 1852 bdrv_padding_destroy(&pad); 1853 1854 return ret; 1855 } 1856 1857 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1858 int64_t offset, int64_t bytes, BdrvRequestFlags flags) 1859 { 1860 BlockDriver *drv = bs->drv; 1861 QEMUIOVector qiov; 1862 void *buf = NULL; 1863 int ret = 0; 1864 bool need_flush = false; 1865 int head = 0; 1866 int tail = 0; 1867 1868 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); 1869 int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1870 bs->bl.request_alignment); 1871 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1872 1873 bdrv_check_request(offset, bytes, &error_abort); 1874 1875 if (!drv) { 1876 return -ENOMEDIUM; 1877 } 1878 1879 if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) { 1880 return -ENOTSUP; 1881 } 1882 1883 assert(alignment % bs->bl.request_alignment == 0); 1884 head = offset % alignment; 1885 tail = (offset + bytes) % alignment; 1886 max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1887 assert(max_write_zeroes >= bs->bl.request_alignment); 1888 1889 while (bytes > 0 && !ret) { 1890 int64_t num = bytes; 1891 1892 /* Align request. Block drivers can expect the "bulk" of the request 1893 * to be aligned, and that unaligned requests do not cross cluster 1894 * boundaries. 1895 */ 1896 if (head) { 1897 /* Make a small request up to the first aligned sector. For 1898 * convenience, limit this request to max_transfer even if 1899 * we don't need to fall back to writes. */ 1900 num = MIN(MIN(bytes, max_transfer), alignment - head); 1901 head = (head + num) % alignment; 1902 assert(num < max_write_zeroes); 1903 } else if (tail && num > alignment) { 1904 /* Shorten the request to the last aligned sector. 
*/ 1905 num -= tail; 1906 } 1907 1908 /* limit request size */ 1909 if (num > max_write_zeroes) { 1910 num = max_write_zeroes; 1911 } 1912 1913 ret = -ENOTSUP; 1914 /* First try the efficient write zeroes operation */ 1915 if (drv->bdrv_co_pwrite_zeroes) { 1916 ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, 1917 flags & bs->supported_zero_flags); 1918 if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && 1919 !(bs->supported_zero_flags & BDRV_REQ_FUA)) { 1920 need_flush = true; 1921 } 1922 } else { 1923 assert(!bs->supported_zero_flags); 1924 } 1925 1926 if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) { 1927 /* Fall back to bounce buffer if write zeroes is unsupported */ 1928 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; 1929 1930 if ((flags & BDRV_REQ_FUA) && 1931 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1932 /* No need for bdrv_driver_pwrite() to do a fallback 1933 * flush on each chunk; use just one at the end */ 1934 write_flags &= ~BDRV_REQ_FUA; 1935 need_flush = true; 1936 } 1937 num = MIN(num, max_transfer); 1938 if (buf == NULL) { 1939 buf = qemu_try_blockalign0(bs, num); 1940 if (buf == NULL) { 1941 ret = -ENOMEM; 1942 goto fail; 1943 } 1944 } 1945 qemu_iovec_init_buf(&qiov, buf, num); 1946 1947 ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags); 1948 1949 /* Keep bounce buffer around if it is big enough for all 1950 * all future requests. 1951 */ 1952 if (num < max_transfer) { 1953 qemu_vfree(buf); 1954 buf = NULL; 1955 } 1956 } 1957 1958 offset += num; 1959 bytes -= num; 1960 } 1961 1962 fail: 1963 if (ret == 0 && need_flush) { 1964 ret = bdrv_co_flush(bs); 1965 } 1966 qemu_vfree(buf); 1967 return ret; 1968 } 1969 1970 static inline int coroutine_fn 1971 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes, 1972 BdrvTrackedRequest *req, int flags) 1973 { 1974 BlockDriverState *bs = child->bs; 1975 1976 bdrv_check_request(offset, bytes, &error_abort); 1977 1978 if (bdrv_is_read_only(bs)) { 1979 return -EPERM; 1980 } 1981 1982 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1983 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1984 assert(!(flags & ~BDRV_REQ_MASK)); 1985 assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING))); 1986 1987 if (flags & BDRV_REQ_SERIALISING) { 1988 QEMU_LOCK_GUARD(&bs->reqs_lock); 1989 1990 tracked_request_set_serialising(req, bdrv_get_cluster_size(bs)); 1991 1992 if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) { 1993 return -EBUSY; 1994 } 1995 1996 bdrv_wait_serialising_requests_locked(req); 1997 } else { 1998 bdrv_wait_serialising_requests(req); 1999 } 2000 2001 assert(req->overlap_offset <= offset); 2002 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 2003 assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE || 2004 child->perm & BLK_PERM_RESIZE); 2005 2006 switch (req->type) { 2007 case BDRV_TRACKED_WRITE: 2008 case BDRV_TRACKED_DISCARD: 2009 if (flags & BDRV_REQ_WRITE_UNCHANGED) { 2010 assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 2011 } else { 2012 assert(child->perm & BLK_PERM_WRITE); 2013 } 2014 bdrv_write_threshold_check_write(bs, offset, bytes); 2015 return 0; 2016 case BDRV_TRACKED_TRUNCATE: 2017 assert(child->perm & BLK_PERM_RESIZE); 2018 return 0; 2019 default: 2020 abort(); 2021 } 2022 } 2023 2024 static inline void coroutine_fn 2025 bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes, 2026 BdrvTrackedRequest *req, int ret) 2027 { 2028 int64_t end_sector = 
DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 2029 BlockDriverState *bs = child->bs; 2030 2031 bdrv_check_request(offset, bytes, &error_abort); 2032 2033 qatomic_inc(&bs->write_gen); 2034 2035 /* 2036 * Discard cannot extend the image, but in error handling cases, such as 2037 * when reverting a qcow2 cluster allocation, the discarded range can pass 2038 * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD 2039 * here. Instead, just skip it, since semantically a discard request 2040 * beyond EOF cannot expand the image anyway. 2041 */ 2042 if (ret == 0 && 2043 (req->type == BDRV_TRACKED_TRUNCATE || 2044 end_sector > bs->total_sectors) && 2045 req->type != BDRV_TRACKED_DISCARD) { 2046 bs->total_sectors = end_sector; 2047 bdrv_parent_cb_resize(bs); 2048 bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); 2049 } 2050 if (req->bytes) { 2051 switch (req->type) { 2052 case BDRV_TRACKED_WRITE: 2053 stat64_max(&bs->wr_highest_offset, offset + bytes); 2054 /* fall through, to set dirty bits */ 2055 case BDRV_TRACKED_DISCARD: 2056 bdrv_set_dirty(bs, offset, bytes); 2057 break; 2058 default: 2059 break; 2060 } 2061 } 2062 } 2063 2064 /* 2065 * Forwards an already correctly aligned write request to the BlockDriver, 2066 * after possibly fragmenting it. 2067 */ 2068 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 2069 BdrvTrackedRequest *req, int64_t offset, int64_t bytes, 2070 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 2071 { 2072 BlockDriverState *bs = child->bs; 2073 BlockDriver *drv = bs->drv; 2074 int ret; 2075 2076 int64_t bytes_remaining = bytes; 2077 int max_transfer; 2078 2079 bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 2080 2081 if (!drv) { 2082 return -ENOMEDIUM; 2083 } 2084 2085 if (bdrv_has_readonly_bitmaps(bs)) { 2086 return -EPERM; 2087 } 2088 2089 assert(is_power_of_2(align)); 2090 assert((offset & (align - 1)) == 0); 2091 assert((bytes & (align - 1)) == 0); 2092 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 2093 align); 2094 2095 ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 2096 2097 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 2098 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 2099 qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 2100 flags |= BDRV_REQ_ZERO_WRITE; 2101 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 2102 flags |= BDRV_REQ_MAY_UNMAP; 2103 } 2104 } 2105 2106 if (ret < 0) { 2107 /* Do nothing, write notifier decided to fail this request */ 2108 } else if (flags & BDRV_REQ_ZERO_WRITE) { 2109 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 2110 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 2111 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 2112 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 2113 qiov, qiov_offset); 2114 } else if (bytes <= max_transfer) { 2115 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2116 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 2117 } else { 2118 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2119 while (bytes_remaining) { 2120 int num = MIN(bytes_remaining, max_transfer); 2121 int local_flags = flags; 2122 2123 assert(num); 2124 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 2125 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 2126 /* If FUA is going to be emulated by flush, we only 2127 * need to flush on the last iteration */ 2128 local_flags &= ~BDRV_REQ_FUA; 2129 } 2130 2131 ret = 
bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 2132 num, qiov, 2133 qiov_offset + bytes - bytes_remaining, 2134 local_flags); 2135 if (ret < 0) { 2136 break; 2137 } 2138 bytes_remaining -= num; 2139 } 2140 } 2141 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 2142 2143 if (ret >= 0) { 2144 ret = 0; 2145 } 2146 bdrv_co_write_req_finish(child, offset, bytes, req, ret); 2147 2148 return ret; 2149 } 2150 2151 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 2152 int64_t offset, 2153 int64_t bytes, 2154 BdrvRequestFlags flags, 2155 BdrvTrackedRequest *req) 2156 { 2157 BlockDriverState *bs = child->bs; 2158 QEMUIOVector local_qiov; 2159 uint64_t align = bs->bl.request_alignment; 2160 int ret = 0; 2161 bool padding; 2162 BdrvRequestPadding pad; 2163 2164 padding = bdrv_init_padding(bs, offset, bytes, &pad); 2165 if (padding) { 2166 bdrv_make_request_serialising(req, align); 2167 2168 bdrv_padding_rmw_read(child, req, &pad, true); 2169 2170 if (pad.head || pad.merge_reads) { 2171 int64_t aligned_offset = offset & ~(align - 1); 2172 int64_t write_bytes = pad.merge_reads ? pad.buf_len : align; 2173 2174 qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 2175 ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 2176 align, &local_qiov, 0, 2177 flags & ~BDRV_REQ_ZERO_WRITE); 2178 if (ret < 0 || pad.merge_reads) { 2179 /* Error or all work is done */ 2180 goto out; 2181 } 2182 offset += write_bytes - pad.head; 2183 bytes -= write_bytes - pad.head; 2184 } 2185 } 2186 2187 assert(!bytes || (offset & (align - 1)) == 0); 2188 if (bytes >= align) { 2189 /* Write the aligned part in the middle. */ 2190 int64_t aligned_bytes = bytes & ~(align - 1); 2191 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 2192 NULL, 0, flags); 2193 if (ret < 0) { 2194 goto out; 2195 } 2196 bytes -= aligned_bytes; 2197 offset += aligned_bytes; 2198 } 2199 2200 assert(!bytes || (offset & (align - 1)) == 0); 2201 if (bytes) { 2202 assert(align == pad.tail + bytes); 2203 2204 qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align); 2205 ret = bdrv_aligned_pwritev(child, req, offset, align, align, 2206 &local_qiov, 0, 2207 flags & ~BDRV_REQ_ZERO_WRITE); 2208 } 2209 2210 out: 2211 bdrv_padding_destroy(&pad); 2212 2213 return ret; 2214 } 2215 2216 /* 2217 * Handle a write request in coroutine context 2218 */ 2219 int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 2220 int64_t offset, int64_t bytes, QEMUIOVector *qiov, 2221 BdrvRequestFlags flags) 2222 { 2223 return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); 2224 } 2225 2226 int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, 2227 int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, 2228 BdrvRequestFlags flags) 2229 { 2230 BlockDriverState *bs = child->bs; 2231 BdrvTrackedRequest req; 2232 uint64_t align = bs->bl.request_alignment; 2233 BdrvRequestPadding pad; 2234 int ret; 2235 bool padded = false; 2236 2237 trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags); 2238 2239 if (!bdrv_is_inserted(bs)) { 2240 return -ENOMEDIUM; 2241 } 2242 2243 ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); 2244 if (ret < 0) { 2245 return ret; 2246 } 2247 2248 /* If the request is misaligned then we can't make it efficient */ 2249 if ((flags & BDRV_REQ_NO_FALLBACK) && 2250 !QEMU_IS_ALIGNED(offset | bytes, align)) 2251 { 2252 return -ENOTSUP; 2253 } 2254 2255 if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 2256 /* 2257 * Aligning zero request is nonsense. 
Even if the driver attaches a special meaning 2258 * to zero-length requests (like qcow2_co_pwritev_compressed_part), we can't 2259 * pass them to the driver due to request_alignment. 2260 * 2261 * Still, there is no reason to return an error if someone does an unaligned 2262 * zero-length write occasionally. 2263 */ 2264 return 0; 2265 } 2266 2267 if (!(flags & BDRV_REQ_ZERO_WRITE)) { 2268 /* 2269 * Pad the request for the following read-modify-write cycle. 2270 * bdrv_co_do_zero_pwritev() does the aligning by itself, so we do 2271 * the alignment only if there is no ZERO flag. 2272 */ 2273 ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, 2274 &padded); 2275 if (ret < 0) { 2276 return ret; 2277 } 2278 } 2279 2280 bdrv_inc_in_flight(bs); 2281 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 2282 2283 if (flags & BDRV_REQ_ZERO_WRITE) { 2284 assert(!padded); 2285 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 2286 goto out; 2287 } 2288 2289 if (padded) { 2290 /* 2291 * Request was unaligned to request_alignment and therefore 2292 * padded. We are going to do read-modify-write, and must 2293 * serialize the request to prevent interactions of the 2294 * widened region with other transactions. 2295 */ 2296 bdrv_make_request_serialising(&req, align); 2297 bdrv_padding_rmw_read(child, &req, &pad, false); 2298 } 2299 2300 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 2301 qiov, qiov_offset, flags); 2302 2303 bdrv_padding_destroy(&pad); 2304 2305 out: 2306 tracked_request_end(&req); 2307 bdrv_dec_in_flight(bs); 2308 2309 return ret; 2310 } 2311 2312 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2313 int64_t bytes, BdrvRequestFlags flags) 2314 { 2315 trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 2316 2317 if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 2318 flags &= ~BDRV_REQ_MAY_UNMAP; 2319 } 2320 2321 return bdrv_co_pwritev(child, offset, bytes, NULL, 2322 BDRV_REQ_ZERO_WRITE | flags); 2323 } 2324 2325 /* 2326 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend or not. 2327 */ 2328 int bdrv_flush_all(void) 2329 { 2330 BdrvNextIterator it; 2331 BlockDriverState *bs = NULL; 2332 int result = 0; 2333 2334 /* 2335 * The bdrv queue is managed by record/replay; 2336 * creating a new flush request when stopping 2337 * the VM may break determinism. 2338 */ 2339 if (replay_events_enabled()) { 2340 return result; 2341 } 2342 2343 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 2344 AioContext *aio_context = bdrv_get_aio_context(bs); 2345 int ret; 2346 2347 aio_context_acquire(aio_context); 2348 ret = bdrv_flush(bs); 2349 if (ret < 0 && !result) { 2350 result = ret; 2351 } 2352 aio_context_release(aio_context); 2353 } 2354 2355 return result; 2356 } 2357 2358 /* 2359 * Returns the allocation status of the specified range. 2360 * Drivers not implementing the functionality are assumed to not support 2361 * backing files, hence all their sectors are reported as allocated. 2362 * 2363 * If 'want_zero' is true, the caller is querying for mapping 2364 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 2365 * _ZERO where possible; otherwise, the result favors larger 'pnum', 2366 * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2367 * 2368 * If 'offset' is beyond the end of the disk image the return value is 2369 * BDRV_BLOCK_EOF and 'pnum' is set to 0. 2370 * 2371 * 'bytes' is the max value 'pnum' should be set to.
If bytes goes 2372 * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2373 * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 2374 * 2375 * 'pnum' is set to the number of bytes (including and immediately 2376 * following the specified offset) that are easily known to be in the 2377 * same allocated/unallocated state. Note that a second call starting 2378 * at the original offset plus returned pnum may have the same status. 2379 * The returned value is non-zero on success except at end-of-file. 2380 * 2381 * Returns negative errno on failure. Otherwise, if the 2382 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 2383 * set to the host mapping and BDS corresponding to the guest offset. 2384 */ 2385 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2386 bool want_zero, 2387 int64_t offset, int64_t bytes, 2388 int64_t *pnum, int64_t *map, 2389 BlockDriverState **file) 2390 { 2391 int64_t total_size; 2392 int64_t n; /* bytes */ 2393 int ret; 2394 int64_t local_map = 0; 2395 BlockDriverState *local_file = NULL; 2396 int64_t aligned_offset, aligned_bytes; 2397 uint32_t align; 2398 bool has_filtered_child; 2399 2400 assert(pnum); 2401 *pnum = 0; 2402 total_size = bdrv_getlength(bs); 2403 if (total_size < 0) { 2404 ret = total_size; 2405 goto early_out; 2406 } 2407 2408 if (offset >= total_size) { 2409 ret = BDRV_BLOCK_EOF; 2410 goto early_out; 2411 } 2412 if (!bytes) { 2413 ret = 0; 2414 goto early_out; 2415 } 2416 2417 n = total_size - offset; 2418 if (n < bytes) { 2419 bytes = n; 2420 } 2421 2422 /* Must be non-NULL or bdrv_getlength() would have failed */ 2423 assert(bs->drv); 2424 has_filtered_child = bdrv_filter_child(bs); 2425 if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { 2426 *pnum = bytes; 2427 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 2428 if (offset + bytes == total_size) { 2429 ret |= BDRV_BLOCK_EOF; 2430 } 2431 if (bs->drv->protocol_name) { 2432 ret |= BDRV_BLOCK_OFFSET_VALID; 2433 local_map = offset; 2434 local_file = bs; 2435 } 2436 goto early_out; 2437 } 2438 2439 bdrv_inc_in_flight(bs); 2440 2441 /* Round out to request_alignment boundaries */ 2442 align = bs->bl.request_alignment; 2443 aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2444 aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2445 2446 if (bs->drv->bdrv_co_block_status) { 2447 ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 2448 aligned_bytes, pnum, &local_map, 2449 &local_file); 2450 } else { 2451 /* Default code for filters */ 2452 2453 local_file = bdrv_filter_bs(bs); 2454 assert(local_file); 2455 2456 *pnum = aligned_bytes; 2457 local_map = aligned_offset; 2458 ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2459 } 2460 if (ret < 0) { 2461 *pnum = 0; 2462 goto out; 2463 } 2464 2465 /* 2466 * The driver's result must be a non-zero multiple of request_alignment. 2467 * Clamp pnum and adjust map to original request. 
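*
* For example (illustrative numbers only, not taken from any particular
* driver): with request_alignment 512, a query at offset 700 for 1000 bytes
* is widened to aligned_offset 512 and aligned_bytes 1536. If the driver
* reports *pnum == 1536, the code below subtracts the 188-byte head
* (700 - 512), clamps the result to the requested 1000 bytes and, when
* BDRV_BLOCK_OFFSET_VALID is set, shifts local_map by the same 188 bytes.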
2468 */ 2469 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2470 align > offset - aligned_offset); 2471 if (ret & BDRV_BLOCK_RECURSE) { 2472 assert(ret & BDRV_BLOCK_DATA); 2473 assert(ret & BDRV_BLOCK_OFFSET_VALID); 2474 assert(!(ret & BDRV_BLOCK_ZERO)); 2475 } 2476 2477 *pnum -= offset - aligned_offset; 2478 if (*pnum > bytes) { 2479 *pnum = bytes; 2480 } 2481 if (ret & BDRV_BLOCK_OFFSET_VALID) { 2482 local_map += offset - aligned_offset; 2483 } 2484 2485 if (ret & BDRV_BLOCK_RAW) { 2486 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2487 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2488 *pnum, pnum, &local_map, &local_file); 2489 goto out; 2490 } 2491 2492 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2493 ret |= BDRV_BLOCK_ALLOCATED; 2494 } else if (bs->drv->supports_backing) { 2495 BlockDriverState *cow_bs = bdrv_cow_bs(bs); 2496 2497 if (!cow_bs) { 2498 ret |= BDRV_BLOCK_ZERO; 2499 } else if (want_zero) { 2500 int64_t size2 = bdrv_getlength(cow_bs); 2501 2502 if (size2 >= 0 && offset >= size2) { 2503 ret |= BDRV_BLOCK_ZERO; 2504 } 2505 } 2506 } 2507 2508 if (want_zero && ret & BDRV_BLOCK_RECURSE && 2509 local_file && local_file != bs && 2510 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2511 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2512 int64_t file_pnum; 2513 int ret2; 2514 2515 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2516 *pnum, &file_pnum, NULL, NULL); 2517 if (ret2 >= 0) { 2518 /* Ignore errors. This is just providing extra information, it 2519 * is useful but not necessary. 2520 */ 2521 if (ret2 & BDRV_BLOCK_EOF && 2522 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2523 /* 2524 * It is valid for the format block driver to read 2525 * beyond the end of the underlying file's current 2526 * size; such areas read as zero. 
2527 */ 2528 ret |= BDRV_BLOCK_ZERO; 2529 } else { 2530 /* Limit request to the range reported by the protocol driver */ 2531 *pnum = file_pnum; 2532 ret |= (ret2 & BDRV_BLOCK_ZERO); 2533 } 2534 } 2535 } 2536 2537 out: 2538 bdrv_dec_in_flight(bs); 2539 if (ret >= 0 && offset + *pnum == total_size) { 2540 ret |= BDRV_BLOCK_EOF; 2541 } 2542 early_out: 2543 if (file) { 2544 *file = local_file; 2545 } 2546 if (map) { 2547 *map = local_map; 2548 } 2549 return ret; 2550 } 2551 2552 int coroutine_fn 2553 bdrv_co_common_block_status_above(BlockDriverState *bs, 2554 BlockDriverState *base, 2555 bool include_base, 2556 bool want_zero, 2557 int64_t offset, 2558 int64_t bytes, 2559 int64_t *pnum, 2560 int64_t *map, 2561 BlockDriverState **file, 2562 int *depth) 2563 { 2564 int ret; 2565 BlockDriverState *p; 2566 int64_t eof = 0; 2567 int dummy; 2568 2569 assert(!include_base || base); /* Can't include NULL base */ 2570 2571 if (!depth) { 2572 depth = &dummy; 2573 } 2574 *depth = 0; 2575 2576 if (!include_base && bs == base) { 2577 *pnum = bytes; 2578 return 0; 2579 } 2580 2581 ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file); 2582 ++*depth; 2583 if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) { 2584 return ret; 2585 } 2586 2587 if (ret & BDRV_BLOCK_EOF) { 2588 eof = offset + *pnum; 2589 } 2590 2591 assert(*pnum <= bytes); 2592 bytes = *pnum; 2593 2594 for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base; 2595 p = bdrv_filter_or_cow_bs(p)) 2596 { 2597 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2598 file); 2599 ++*depth; 2600 if (ret < 0) { 2601 return ret; 2602 } 2603 if (*pnum == 0) { 2604 /* 2605 * The top layer deferred to this layer, and because this layer is 2606 * short, any zeroes that we synthesize beyond EOF behave as if they 2607 * were allocated at this layer. 2608 * 2609 * We don't include BDRV_BLOCK_EOF into ret, as upper layer may be 2610 * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 2611 * below. 2612 */ 2613 assert(ret & BDRV_BLOCK_EOF); 2614 *pnum = bytes; 2615 if (file) { 2616 *file = p; 2617 } 2618 ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED; 2619 break; 2620 } 2621 if (ret & BDRV_BLOCK_ALLOCATED) { 2622 /* 2623 * We've found the node and the status, we must break. 2624 * 2625 * Drop BDRV_BLOCK_EOF, as it's not for upper layer, which may be 2626 * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 2627 * below. 2628 */ 2629 ret &= ~BDRV_BLOCK_EOF; 2630 break; 2631 } 2632 2633 if (p == base) { 2634 assert(include_base); 2635 break; 2636 } 2637 2638 /* 2639 * OK, [offset, offset + *pnum) region is unallocated on this layer, 2640 * let's continue the diving. 
2641 */ 2642 assert(*pnum <= bytes); 2643 bytes = *pnum; 2644 } 2645 2646 if (offset + *pnum == eof) { 2647 ret |= BDRV_BLOCK_EOF; 2648 } 2649 2650 return ret; 2651 } 2652 2653 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 2654 int64_t offset, int64_t bytes, int64_t *pnum, 2655 int64_t *map, BlockDriverState **file) 2656 { 2657 return bdrv_common_block_status_above(bs, base, false, true, offset, bytes, 2658 pnum, map, file, NULL); 2659 } 2660 2661 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2662 int64_t *pnum, int64_t *map, BlockDriverState **file) 2663 { 2664 return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), 2665 offset, bytes, pnum, map, file); 2666 } 2667 2668 /* 2669 * Check @bs (and its backing chain) to see if the range defined 2670 * by @offset and @bytes is known to read as zeroes. 2671 * Return 1 if that is the case, 0 otherwise and -errno on error. 2672 * This test is meant to be fast rather than accurate so returning 0 2673 * does not guarantee non-zero data. 2674 */ 2675 int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, 2676 int64_t bytes) 2677 { 2678 int ret; 2679 int64_t pnum = bytes; 2680 2681 if (!bytes) { 2682 return 1; 2683 } 2684 2685 ret = bdrv_common_block_status_above(bs, NULL, false, false, offset, 2686 bytes, &pnum, NULL, NULL, NULL); 2687 2688 if (ret < 0) { 2689 return ret; 2690 } 2691 2692 return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); 2693 } 2694 2695 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2696 int64_t bytes, int64_t *pnum) 2697 { 2698 int ret; 2699 int64_t dummy; 2700 2701 ret = bdrv_common_block_status_above(bs, bs, true, false, offset, 2702 bytes, pnum ? pnum : &dummy, NULL, 2703 NULL, NULL); 2704 if (ret < 0) { 2705 return ret; 2706 } 2707 return !!(ret & BDRV_BLOCK_ALLOCATED); 2708 } 2709 2710 /* 2711 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 2712 * 2713 * Return a positive depth if (a prefix of) the given range is allocated 2714 * in any image between BASE and TOP (BASE is only included if include_base 2715 * is set). Depth 1 is TOP, 2 is the first backing layer, and so forth. 2716 * BASE can be NULL to check if the given offset is allocated in any 2717 * image of the chain. Return 0 otherwise, or negative errno on 2718 * failure. 2719 * 2720 * 'pnum' is set to the number of bytes (including and immediately 2721 * following the specified offset) that are known to be in the same 2722 * allocated/unallocated state. Note that a subsequent call starting 2723 * at 'offset + *pnum' may return the same allocation status (in other 2724 * words, the result is not necessarily the maximum possible range); 2725 * but 'pnum' will only be 0 when end of file is reached. 
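*
* A hypothetical caller deciding whether a range still needs to be copied up
* from an intermediate layer might use it like this (sketch only; 'top',
* 'base', 'offset' and 'bytes' are assumed to exist in the caller):
*
*     int64_t pnum;
*     int depth = bdrv_is_allocated_above(top, base, false, offset, bytes,
*                                         &pnum);
*     if (depth < 0) {
*         return depth;                   // error
*     } else if (depth > 0) {
*         // first 'pnum' bytes are allocated in layer 'depth' (1 == top)
*     } else {
*         // first 'pnum' bytes are not allocated above 'base'
*     }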
2726 */ 2727 int bdrv_is_allocated_above(BlockDriverState *top, 2728 BlockDriverState *base, 2729 bool include_base, int64_t offset, 2730 int64_t bytes, int64_t *pnum) 2731 { 2732 int depth; 2733 int ret = bdrv_common_block_status_above(top, base, include_base, false, 2734 offset, bytes, pnum, NULL, NULL, 2735 &depth); 2736 if (ret < 0) { 2737 return ret; 2738 } 2739 2740 if (ret & BDRV_BLOCK_ALLOCATED) { 2741 return depth; 2742 } 2743 return 0; 2744 } 2745 2746 int coroutine_fn 2747 bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2748 { 2749 BlockDriver *drv = bs->drv; 2750 BlockDriverState *child_bs = bdrv_primary_bs(bs); 2751 int ret = -ENOTSUP; 2752 2753 if (!drv) { 2754 return -ENOMEDIUM; 2755 } 2756 2757 bdrv_inc_in_flight(bs); 2758 2759 if (drv->bdrv_load_vmstate) { 2760 ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2761 } else if (child_bs) { 2762 ret = bdrv_co_readv_vmstate(child_bs, qiov, pos); 2763 } 2764 2765 bdrv_dec_in_flight(bs); 2766 2767 return ret; 2768 } 2769 2770 int coroutine_fn 2771 bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2772 { 2773 BlockDriver *drv = bs->drv; 2774 BlockDriverState *child_bs = bdrv_primary_bs(bs); 2775 int ret = -ENOTSUP; 2776 2777 if (!drv) { 2778 return -ENOMEDIUM; 2779 } 2780 2781 bdrv_inc_in_flight(bs); 2782 2783 if (drv->bdrv_save_vmstate) { 2784 ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2785 } else if (child_bs) { 2786 ret = bdrv_co_writev_vmstate(child_bs, qiov, pos); 2787 } 2788 2789 bdrv_dec_in_flight(bs); 2790 2791 return ret; 2792 } 2793 2794 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 2795 int64_t pos, int size) 2796 { 2797 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2798 int ret = bdrv_writev_vmstate(bs, &qiov, pos); 2799 2800 return ret < 0 ? ret : size; 2801 } 2802 2803 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 2804 int64_t pos, int size) 2805 { 2806 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2807 int ret = bdrv_readv_vmstate(bs, &qiov, pos); 2808 2809 return ret < 0 ? ret : size; 2810 } 2811 2812 /**************************************************************/ 2813 /* async I/Os */ 2814 2815 void bdrv_aio_cancel(BlockAIOCB *acb) 2816 { 2817 qemu_aio_ref(acb); 2818 bdrv_aio_cancel_async(acb); 2819 while (acb->refcnt > 1) { 2820 if (acb->aiocb_info->get_aio_context) { 2821 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2822 } else if (acb->bs) { 2823 /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 2824 * assert that we're not using an I/O thread. Thread-safe 2825 * code should use bdrv_aio_cancel_async exclusively. 2826 */ 2827 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 2828 aio_poll(bdrv_get_aio_context(acb->bs), true); 2829 } else { 2830 abort(); 2831 } 2832 } 2833 qemu_aio_unref(acb); 2834 } 2835 2836 /* Async version of aio cancel. The caller is not blocked if the acb implements 2837 * cancel_async, otherwise we do nothing and let the request normally complete. 2838 * In either case the completion callback must be called. 
*/ 2839 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2840 { 2841 if (acb->aiocb_info->cancel_async) { 2842 acb->aiocb_info->cancel_async(acb); 2843 } 2844 } 2845 2846 /**************************************************************/ 2847 /* Coroutine block device emulation */ 2848 2849 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2850 { 2851 BdrvChild *primary_child = bdrv_primary_child(bs); 2852 BdrvChild *child; 2853 int current_gen; 2854 int ret = 0; 2855 2856 bdrv_inc_in_flight(bs); 2857 2858 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2859 bdrv_is_sg(bs)) { 2860 goto early_exit; 2861 } 2862 2863 qemu_co_mutex_lock(&bs->reqs_lock); 2864 current_gen = qatomic_read(&bs->write_gen); 2865 2866 /* Wait until any previous flushes are completed */ 2867 while (bs->active_flush_req) { 2868 qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 2869 } 2870 2871 /* Flushes reach this point in nondecreasing current_gen order. */ 2872 bs->active_flush_req = true; 2873 qemu_co_mutex_unlock(&bs->reqs_lock); 2874 2875 /* Write back all layers by calling one driver function */ 2876 if (bs->drv->bdrv_co_flush) { 2877 ret = bs->drv->bdrv_co_flush(bs); 2878 goto out; 2879 } 2880 2881 /* Write back cached data to the OS even with cache=unsafe */ 2882 BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); 2883 if (bs->drv->bdrv_co_flush_to_os) { 2884 ret = bs->drv->bdrv_co_flush_to_os(bs); 2885 if (ret < 0) { 2886 goto out; 2887 } 2888 } 2889 2890 /* But don't actually force it to the disk with cache=unsafe */ 2891 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2892 goto flush_children; 2893 } 2894 2895 /* Check if we really need to flush anything */ 2896 if (bs->flushed_gen == current_gen) { 2897 goto flush_children; 2898 } 2899 2900 BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); 2901 if (!bs->drv) { 2902 /* bs->drv->bdrv_co_flush() might have ejected the BDS 2903 * (even in case of apparent success) */ 2904 ret = -ENOMEDIUM; 2905 goto out; 2906 } 2907 if (bs->drv->bdrv_co_flush_to_disk) { 2908 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2909 } else if (bs->drv->bdrv_aio_flush) { 2910 BlockAIOCB *acb; 2911 CoroutineIOCompletion co = { 2912 .coroutine = qemu_coroutine_self(), 2913 }; 2914 2915 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2916 if (acb == NULL) { 2917 ret = -EIO; 2918 } else { 2919 qemu_coroutine_yield(); 2920 ret = co.ret; 2921 } 2922 } else { 2923 /* 2924 * Some block drivers always operate in either writethrough or unsafe 2925 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2926 * know how the server works (because the behaviour is hardcoded or 2927 * depends on server-side configuration), so we can't ensure that 2928 * everything is safe on disk. Returning an error doesn't work because 2929 * that would break guests even if the server operates in writethrough 2930 * mode. 2931 * 2932 * Let's hope the user knows what he's doing. 2933 */ 2934 ret = 0; 2935 } 2936 2937 if (ret < 0) { 2938 goto out; 2939 } 2940 2941 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2942 * in the case of cache=unsafe, so there are no useless flushes. 
2943 */ 2944 flush_children: 2945 ret = 0; 2946 QLIST_FOREACH(child, &bs->children, next) { 2947 if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { 2948 int this_child_ret = bdrv_co_flush(child->bs); 2949 if (!ret) { 2950 ret = this_child_ret; 2951 } 2952 } 2953 } 2954 2955 out: 2956 /* Notify any pending flushes that we have completed */ 2957 if (ret == 0) { 2958 bs->flushed_gen = current_gen; 2959 } 2960 2961 qemu_co_mutex_lock(&bs->reqs_lock); 2962 bs->active_flush_req = false; 2963 /* Return value is ignored - it's ok if wait queue is empty */ 2964 qemu_co_queue_next(&bs->flush_queue); 2965 qemu_co_mutex_unlock(&bs->reqs_lock); 2966 2967 early_exit: 2968 bdrv_dec_in_flight(bs); 2969 return ret; 2970 } 2971 2972 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2973 int64_t bytes) 2974 { 2975 BdrvTrackedRequest req; 2976 int max_pdiscard, ret; 2977 int head, tail, align; 2978 BlockDriverState *bs = child->bs; 2979 2980 if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 2981 return -ENOMEDIUM; 2982 } 2983 2984 if (bdrv_has_readonly_bitmaps(bs)) { 2985 return -EPERM; 2986 } 2987 2988 ret = bdrv_check_request(offset, bytes, NULL); 2989 if (ret < 0) { 2990 return ret; 2991 } 2992 2993 /* Do nothing if disabled. */ 2994 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2995 return 0; 2996 } 2997 2998 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 2999 return 0; 3000 } 3001 3002 /* Discard is advisory, but some devices track and coalesce 3003 * unaligned requests, so we must pass everything down rather than 3004 * round here. Still, most devices will just silently ignore 3005 * unaligned requests (by returning -ENOTSUP), so we must fragment 3006 * the request accordingly. */ 3007 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 3008 assert(align % bs->bl.request_alignment == 0); 3009 head = offset % align; 3010 tail = (offset + bytes) % align; 3011 3012 bdrv_inc_in_flight(bs); 3013 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 3014 3015 ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 3016 if (ret < 0) { 3017 goto out; 3018 } 3019 3020 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 3021 align); 3022 assert(max_pdiscard >= bs->bl.request_alignment); 3023 3024 while (bytes > 0) { 3025 int64_t num = bytes; 3026 3027 if (head) { 3028 /* Make small requests to get to alignment boundaries. */ 3029 num = MIN(bytes, align - head); 3030 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 3031 num %= bs->bl.request_alignment; 3032 } 3033 head = (head + num) % align; 3034 assert(num < max_pdiscard); 3035 } else if (tail) { 3036 if (num > align) { 3037 /* Shorten the request to the last aligned cluster. 
*/ 3038 num -= tail; 3039 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 3040 tail > bs->bl.request_alignment) { 3041 tail %= bs->bl.request_alignment; 3042 num -= tail; 3043 } 3044 } 3045 /* limit request size */ 3046 if (num > max_pdiscard) { 3047 num = max_pdiscard; 3048 } 3049 3050 if (!bs->drv) { 3051 ret = -ENOMEDIUM; 3052 goto out; 3053 } 3054 if (bs->drv->bdrv_co_pdiscard) { 3055 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 3056 } else { 3057 BlockAIOCB *acb; 3058 CoroutineIOCompletion co = { 3059 .coroutine = qemu_coroutine_self(), 3060 }; 3061 3062 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 3063 bdrv_co_io_em_complete, &co); 3064 if (acb == NULL) { 3065 ret = -EIO; 3066 goto out; 3067 } else { 3068 qemu_coroutine_yield(); 3069 ret = co.ret; 3070 } 3071 } 3072 if (ret && ret != -ENOTSUP) { 3073 goto out; 3074 } 3075 3076 offset += num; 3077 bytes -= num; 3078 } 3079 ret = 0; 3080 out: 3081 bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3082 tracked_request_end(&req); 3083 bdrv_dec_in_flight(bs); 3084 return ret; 3085 } 3086 3087 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 3088 { 3089 BlockDriver *drv = bs->drv; 3090 CoroutineIOCompletion co = { 3091 .coroutine = qemu_coroutine_self(), 3092 }; 3093 BlockAIOCB *acb; 3094 3095 bdrv_inc_in_flight(bs); 3096 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 3097 co.ret = -ENOTSUP; 3098 goto out; 3099 } 3100 3101 if (drv->bdrv_co_ioctl) { 3102 co.ret = drv->bdrv_co_ioctl(bs, req, buf); 3103 } else { 3104 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 3105 if (!acb) { 3106 co.ret = -ENOTSUP; 3107 goto out; 3108 } 3109 qemu_coroutine_yield(); 3110 } 3111 out: 3112 bdrv_dec_in_flight(bs); 3113 return co.ret; 3114 } 3115 3116 void *qemu_blockalign(BlockDriverState *bs, size_t size) 3117 { 3118 return qemu_memalign(bdrv_opt_mem_align(bs), size); 3119 } 3120 3121 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 3122 { 3123 return memset(qemu_blockalign(bs, size), 0, size); 3124 } 3125 3126 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 3127 { 3128 size_t align = bdrv_opt_mem_align(bs); 3129 3130 /* Ensure that NULL is never returned on success */ 3131 assert(align > 0); 3132 if (size == 0) { 3133 size = align; 3134 } 3135 3136 return qemu_try_memalign(align, size); 3137 } 3138 3139 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 3140 { 3141 void *mem = qemu_try_blockalign(bs, size); 3142 3143 if (mem) { 3144 memset(mem, 0, size); 3145 } 3146 3147 return mem; 3148 } 3149 3150 /* 3151 * Check if all memory in this vector is sector aligned. 
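*
* For instance, a bounce buffer obtained with qemu_blockalign() will normally
* pass this check, provided its length ('len' below, a caller-supplied size)
* is itself a multiple of the required alignment (illustrative sketch only):
*
*     void *buf = qemu_blockalign(bs, len);
*     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
*
*     if (bdrv_qiov_is_aligned(bs, &qiov)) {
*         // no bounce buffering needed for this vector
*     }
*     qemu_vfree(buf);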
3152 */ 3153 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 3154 { 3155 int i; 3156 size_t alignment = bdrv_min_mem_align(bs); 3157 3158 for (i = 0; i < qiov->niov; i++) { 3159 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 3160 return false; 3161 } 3162 if (qiov->iov[i].iov_len % alignment) { 3163 return false; 3164 } 3165 } 3166 3167 return true; 3168 } 3169 3170 void bdrv_io_plug(BlockDriverState *bs) 3171 { 3172 BdrvChild *child; 3173 3174 QLIST_FOREACH(child, &bs->children, next) { 3175 bdrv_io_plug(child->bs); 3176 } 3177 3178 if (qatomic_fetch_inc(&bs->io_plugged) == 0) { 3179 BlockDriver *drv = bs->drv; 3180 if (drv && drv->bdrv_io_plug) { 3181 drv->bdrv_io_plug(bs); 3182 } 3183 } 3184 } 3185 3186 void bdrv_io_unplug(BlockDriverState *bs) 3187 { 3188 BdrvChild *child; 3189 3190 assert(bs->io_plugged); 3191 if (qatomic_fetch_dec(&bs->io_plugged) == 1) { 3192 BlockDriver *drv = bs->drv; 3193 if (drv && drv->bdrv_io_unplug) { 3194 drv->bdrv_io_unplug(bs); 3195 } 3196 } 3197 3198 QLIST_FOREACH(child, &bs->children, next) { 3199 bdrv_io_unplug(child->bs); 3200 } 3201 } 3202 3203 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 3204 { 3205 BdrvChild *child; 3206 3207 if (bs->drv && bs->drv->bdrv_register_buf) { 3208 bs->drv->bdrv_register_buf(bs, host, size); 3209 } 3210 QLIST_FOREACH(child, &bs->children, next) { 3211 bdrv_register_buf(child->bs, host, size); 3212 } 3213 } 3214 3215 void bdrv_unregister_buf(BlockDriverState *bs, void *host) 3216 { 3217 BdrvChild *child; 3218 3219 if (bs->drv && bs->drv->bdrv_unregister_buf) { 3220 bs->drv->bdrv_unregister_buf(bs, host); 3221 } 3222 QLIST_FOREACH(child, &bs->children, next) { 3223 bdrv_unregister_buf(child->bs, host); 3224 } 3225 } 3226 3227 static int coroutine_fn bdrv_co_copy_range_internal( 3228 BdrvChild *src, int64_t src_offset, BdrvChild *dst, 3229 int64_t dst_offset, int64_t bytes, 3230 BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3231 bool recurse_src) 3232 { 3233 BdrvTrackedRequest req; 3234 int ret; 3235 3236 /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3237 assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3238 assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 3239 3240 if (!dst || !dst->bs || !bdrv_is_inserted(dst->bs)) { 3241 return -ENOMEDIUM; 3242 } 3243 ret = bdrv_check_request32(dst_offset, bytes, NULL, 0); 3244 if (ret) { 3245 return ret; 3246 } 3247 if (write_flags & BDRV_REQ_ZERO_WRITE) { 3248 return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3249 } 3250 3251 if (!src || !src->bs || !bdrv_is_inserted(src->bs)) { 3252 return -ENOMEDIUM; 3253 } 3254 ret = bdrv_check_request32(src_offset, bytes, NULL, 0); 3255 if (ret) { 3256 return ret; 3257 } 3258 3259 if (!src->bs->drv->bdrv_co_copy_range_from 3260 || !dst->bs->drv->bdrv_co_copy_range_to 3261 || src->bs->encrypted || dst->bs->encrypted) { 3262 return -ENOTSUP; 3263 } 3264 3265 if (recurse_src) { 3266 bdrv_inc_in_flight(src->bs); 3267 tracked_request_begin(&req, src->bs, src_offset, bytes, 3268 BDRV_TRACKED_READ); 3269 3270 /* BDRV_REQ_SERIALISING is only for write operation */ 3271 assert(!(read_flags & BDRV_REQ_SERIALISING)); 3272 bdrv_wait_serialising_requests(&req); 3273 3274 ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3275 src, src_offset, 3276 dst, dst_offset, 3277 bytes, 3278 read_flags, write_flags); 3279 3280 tracked_request_end(&req); 3281 bdrv_dec_in_flight(src->bs); 3282 } else { 3283 bdrv_inc_in_flight(dst->bs); 3284 tracked_request_begin(&req, dst->bs, dst_offset, 
bytes, 3285 BDRV_TRACKED_WRITE); 3286 ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 3287 write_flags); 3288 if (!ret) { 3289 ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3290 src, src_offset, 3291 dst, dst_offset, 3292 bytes, 3293 read_flags, write_flags); 3294 } 3295 bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3296 tracked_request_end(&req); 3297 bdrv_dec_in_flight(dst->bs); 3298 } 3299 3300 return ret; 3301 } 3302 3303 /* Copy range from @src to @dst. 3304 * 3305 * See the comment of bdrv_co_copy_range for the parameter and return value 3306 * semantics. */ 3307 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset, 3308 BdrvChild *dst, int64_t dst_offset, 3309 int64_t bytes, 3310 BdrvRequestFlags read_flags, 3311 BdrvRequestFlags write_flags) 3312 { 3313 trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3314 read_flags, write_flags); 3315 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3316 bytes, read_flags, write_flags, true); 3317 } 3318 3319 /* Copy range from @src to @dst. 3320 * 3321 * See the comment of bdrv_co_copy_range for the parameter and return value 3322 * semantics. */ 3323 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset, 3324 BdrvChild *dst, int64_t dst_offset, 3325 int64_t bytes, 3326 BdrvRequestFlags read_flags, 3327 BdrvRequestFlags write_flags) 3328 { 3329 trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3330 read_flags, write_flags); 3331 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3332 bytes, read_flags, write_flags, false); 3333 } 3334 3335 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, 3336 BdrvChild *dst, int64_t dst_offset, 3337 int64_t bytes, BdrvRequestFlags read_flags, 3338 BdrvRequestFlags write_flags) 3339 { 3340 return bdrv_co_copy_range_from(src, src_offset, 3341 dst, dst_offset, 3342 bytes, read_flags, write_flags); 3343 } 3344 3345 static void bdrv_parent_cb_resize(BlockDriverState *bs) 3346 { 3347 BdrvChild *c; 3348 QLIST_FOREACH(c, &bs->parents, next_parent) { 3349 if (c->klass->resize) { 3350 c->klass->resize(c); 3351 } 3352 } 3353 } 3354 3355 /** 3356 * Truncate file to 'offset' bytes (needed only for file protocols) 3357 * 3358 * If 'exact' is true, the file must be resized to exactly the given 3359 * 'offset'. Otherwise, it is sufficient for the node to be at least 3360 * 'offset' bytes in length. 
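*
* A hypothetical caller growing an image from coroutine context might do
* something like this (sketch only; 'child', 'new_size' and 'local_err' are
* assumed to exist in the caller):
*
*     ret = bdrv_co_truncate(child, new_size, false, PREALLOC_MODE_OFF,
*                            0, &local_err);
*     if (ret < 0) {
*         error_report_err(local_err);
*     }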
3361 */ 3362 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, 3363 PreallocMode prealloc, BdrvRequestFlags flags, 3364 Error **errp) 3365 { 3366 BlockDriverState *bs = child->bs; 3367 BdrvChild *filtered, *backing; 3368 BlockDriver *drv = bs->drv; 3369 BdrvTrackedRequest req; 3370 int64_t old_size, new_bytes; 3371 int ret; 3372 3373 3374 /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 3375 if (!drv) { 3376 error_setg(errp, "No medium inserted"); 3377 return -ENOMEDIUM; 3378 } 3379 if (offset < 0) { 3380 error_setg(errp, "Image size cannot be negative"); 3381 return -EINVAL; 3382 } 3383 3384 ret = bdrv_check_request(offset, 0, errp); 3385 if (ret < 0) { 3386 return ret; 3387 } 3388 3389 old_size = bdrv_getlength(bs); 3390 if (old_size < 0) { 3391 error_setg_errno(errp, -old_size, "Failed to get old image size"); 3392 return old_size; 3393 } 3394 3395 if (bdrv_is_read_only(bs)) { 3396 error_setg(errp, "Image is read-only"); 3397 return -EACCES; 3398 } 3399 3400 if (offset > old_size) { 3401 new_bytes = offset - old_size; 3402 } else { 3403 new_bytes = 0; 3404 } 3405 3406 bdrv_inc_in_flight(bs); 3407 tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, 3408 BDRV_TRACKED_TRUNCATE); 3409 3410 /* If we are growing the image and potentially using preallocation for the 3411 * new area, we need to make sure that no write requests are made to it 3412 * concurrently or they might be overwritten by preallocation. */ 3413 if (new_bytes) { 3414 bdrv_make_request_serialising(&req, 1); 3415 } 3416 ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, 3417 0); 3418 if (ret < 0) { 3419 error_setg_errno(errp, -ret, 3420 "Failed to prepare request for truncation"); 3421 goto out; 3422 } 3423 3424 filtered = bdrv_filter_child(bs); 3425 backing = bdrv_cow_child(bs); 3426 3427 /* 3428 * If the image has a backing file that is large enough that it would 3429 * provide data for the new area, we cannot leave it unallocated because 3430 * then the backing file content would become visible. Instead, zero-fill 3431 * the new area. 3432 * 3433 * Note that if the image has a backing file, but was opened without the 3434 * backing file, taking care of keeping things consistent with that backing 3435 * file is the user's responsibility. 
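*
* For example (illustrative sizes only): growing a 1 GiB overlay to 2 GiB
* while its backing file is 1.5 GiB long would otherwise let the backing data
* between 1 GiB and 1.5 GiB show through in the new area; the
* backing_len > old_size check below therefore requests BDRV_REQ_ZERO_WRITE
* for the newly added range.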
3436 */ 3437 if (new_bytes && backing) { 3438 int64_t backing_len; 3439 3440 backing_len = bdrv_getlength(backing->bs); 3441 if (backing_len < 0) { 3442 ret = backing_len; 3443 error_setg_errno(errp, -ret, "Could not get backing file size"); 3444 goto out; 3445 } 3446 3447 if (backing_len > old_size) { 3448 flags |= BDRV_REQ_ZERO_WRITE; 3449 } 3450 } 3451 3452 if (drv->bdrv_co_truncate) { 3453 if (flags & ~bs->supported_truncate_flags) { 3454 error_setg(errp, "Block driver does not support requested flags"); 3455 ret = -ENOTSUP; 3456 goto out; 3457 } 3458 ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); 3459 } else if (filtered) { 3460 ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp); 3461 } else { 3462 error_setg(errp, "Image format driver does not support resize"); 3463 ret = -ENOTSUP; 3464 goto out; 3465 } 3466 if (ret < 0) { 3467 goto out; 3468 } 3469 3470 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3471 if (ret < 0) { 3472 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 3473 } else { 3474 offset = bs->total_sectors * BDRV_SECTOR_SIZE; 3475 } 3476 /* It's possible that truncation succeeded but refresh_total_sectors 3477 * failed, but the latter doesn't affect how we should finish the request. 3478 * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */ 3479 bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); 3480 3481 out: 3482 tracked_request_end(&req); 3483 bdrv_dec_in_flight(bs); 3484 3485 return ret; 3486 } 3487 3488 void bdrv_cancel_in_flight(BlockDriverState *bs) 3489 { 3490 if (!bs || !bs->drv) { 3491 return; 3492 } 3493 3494 if (bs->drv->bdrv_cancel_in_flight) { 3495 bs->drv->bdrv_cancel_in_flight(bs); 3496 } 3497 } 3498