/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                                      bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->klass->drained_end) {
        c->klass->drained_end(c, drained_end_counter);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                                    bool ignore_bds_parents,
                                    int *drained_end_counter)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    c->parent_quiesce_counter++;
    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    BdrvChild *c;
    bool have_limits;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    have_limits = false;
    QLIST_FOREACH(c, &bs->children, next) {
        if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
        {
            bdrv_refresh_limits(c->bs, &local_err);
            if (local_err) {
                error_propagate(errp, local_err);
                return;
            }
            bdrv_merge_limits(&bs->bl, &c->bs->bl);
            have_limits = true;
        }
    }

    if (!have_limits) {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size;

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
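
/*
 * Note on the merge semantics above (illustrative numbers, not taken from a
 * real driver): bdrv_merge_limits() keeps the strictest value of each limit.
 * If one child reports max_transfer = 64 KiB and opt_transfer = 4 KiB while
 * another reports max_transfer = 0 (unlimited) and opt_transfer = 64 KiB,
 * the merged result is max_transfer = 64 KiB (MIN_NON_ZERO treats 0 as
 * "no limit") and opt_transfer = 64 KiB (MAX).
 */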

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
    int *drained_end_counter;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    atomic_mb_set(&data->done, true);
    if (!data->begin) {
        atomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        atomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter);
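
/*
 * Draining from coroutine context: bdrv_co_yield_to_drain() below schedules
 * bdrv_co_drain_bh_cb() as a bottom half in the node's AioContext and then
 * yields.  The BH performs the actual drained_begin/end outside of coroutine
 * context and finally re-enters the coroutine with aio_co_wake().
 */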

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here.  If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs),
                                     bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true, NULL);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

/**
 * This function does not poll, nor must any of its recursively called
 * functions.  The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles.  Therefore, the pointer must remain valid
 * until the pointee reaches 0.  That implies that whoever sets up the
 * pointee has to poll until it is 0.
 *
 * We use atomic operations to access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}

void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int drained_end_counter = 0;
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false,
                            &drained_end_counter);
    }

    BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
}
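
/*
 * Illustrative usage sketch (not taken from a specific caller): a drained
 * section is typically used to keep I/O quiesced while a node is being
 * reconfigured:
 *
 *     bdrv_drained_begin(bs);
 *     ... detach or attach children, change options, ...
 *     bdrv_drained_end(bs);
 *
 * bdrv_drain()/bdrv_co_drain() below are simply such a begin/end pair with
 * nothing in between, i.e. they only wait for requests that are already in
 * flight.
 */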

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay; waiting for the in-flight
     * I/O requests to finish could block forever.
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop.
     */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    int drained_end_counter = 0;

    /*
     * The bdrv queue is managed by record/replay; waiting for the in-flight
     * I/O requests to finish could block forever.
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
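
/*
 * Worked example for tracked_request_overlaps() (illustrative numbers): a
 * request with overlap_offset = 4096 and overlap_bytes = 4096 covers the
 * half-open range [4096, 8192).  A candidate range [0, 4096) or [8192, 12288)
 * does not overlap (touching ranges do not count), while [0, 4097) does.
 */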

static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
                                      BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);
    return waited;
}

bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    BlockDriverState *bs = req->bs;
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;
    bool waited;

    qemu_co_mutex_lock(&bs->reqs_lock);
    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
    waited = bdrv_wait_serialising_requests_locked(bs, req);
    qemu_co_mutex_unlock(&bs->reqs_lock);
    return waited;
}

/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    waited = bdrv_wait_serialising_requests_locked(bs, self);
    qemu_co_mutex_unlock(&bs->reqs_lock);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef int coroutine_fn BdrvRequestEntry(void *opaque);
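
/*
 * bdrv_run_co() below is the bridge between the synchronous wrappers in this
 * file and the coroutine-based implementation: when already running in a
 * coroutine it just calls the entry function, otherwise it creates a
 * coroutine for it and polls the AioContext with BDRV_POLL_WHILE() until the
 * entry function has finished.
 */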
typedef struct BdrvRunCo {
    BdrvRequestEntry *entry;
    void *opaque;
    int ret;
    bool done;
    Coroutine *co; /* Coroutine, running bdrv_run_co_entry, for debugging */
} BdrvRunCo;

static void coroutine_fn bdrv_run_co_entry(void *opaque)
{
    BdrvRunCo *arg = opaque;

    arg->ret = arg->entry(arg->opaque);
    arg->done = true;
    aio_wait_kick();
}

static int bdrv_run_co(BlockDriverState *bs, BdrvRequestEntry *entry,
                       void *opaque)
{
    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        return entry(opaque);
    } else {
        BdrvRunCo s = { .entry = entry, .opaque = opaque };

        s.co = qemu_coroutine_create(bdrv_run_co_entry, &s);
        bdrv_coroutine_enter(bs, s.co);

        BDRV_POLL_WHILE(bs, !s.done);

        return s.ret;
    }
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    BdrvRequestFlags flags;
} RwCo;

static int coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        return bdrv_co_preadv(rwco->child, rwco->offset,
                              rwco->qiov->size, rwco->qiov,
                              rwco->flags);
    } else {
        return bdrv_co_pwritev(rwco->child, rwco->offset,
                               rwco->qiov->size, rwco->qiov,
                               rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .flags = flags,
    };

    return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);

    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}
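
/*
 * Illustrative sketch (not taken from a real caller): the synchronous
 * wrappers below are meant for code that is not running in a coroutine and
 * may block, e.g. image creation or other one-off tooling paths:
 *
 *     uint8_t buf[512];
 *     if (bdrv_pread(child, 0, buf, sizeof(buf)) < 0) {
 *         ... handle the error ...
 *     }
 *
 * Internally they all funnel through bdrv_prwv_co()/bdrv_run_co() above, so
 * they poll the AioContext until the request completes.
 */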
/* return < 0 if error. See bdrv_pwrite() for the return codes */
int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* Return no. of bytes on success or < 0 on error. Important errors are:
   -EIO         generic I/O error (may happen for all errors)
   -ENOMEDIUM   No media inserted.
   -EINVAL      Invalid offset or number of bytes
   -EACCES      Trying to write a read-only device
*/
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
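
/*
 * Interface dispatch used by bdrv_driver_preadv()/bdrv_driver_pwritev()
 * below, newest driver interface first:
 *   1. .bdrv_co_preadv_part / .bdrv_co_pwritev_part (byte-based, qiov offset)
 *   2. .bdrv_co_preadv / .bdrv_co_pwritev           (byte-based)
 *   3. .bdrv_aio_preadv / .bdrv_aio_pwritev         (callback-based AIO)
 *   4. .bdrv_co_readv / .bdrv_co_writev             (legacy, sector-based)
 * The legacy path requires sector-aligned offset and bytes, hence the asserts
 * in front of it.
 */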
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov,
                                           size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv_part) {
        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_preadv) {
        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
        goto out;
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
            goto out;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
            goto out;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev_part) {
        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov,
                               size_t qiov_offset)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector local_qiov;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!block_driver_can_compress(drv)) {
        return -ENOTSUP;
    }

    if (drv->bdrv_co_pwritev_compressed_part) {
        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
                                                    qiov, qiov_offset);
    }

    if (qiov_offset == 0) {
        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
    }

    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}
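
/*
 * Overview of the copy-on-read helper below: the request is rounded out to
 * cluster boundaries, unallocated parts are read into a bounce buffer and
 * written back with BDRV_REQ_WRITE_UNCHANGED (or as zeroes when the bounce
 * buffer turns out to be entirely zero), and only then is the caller's qiov
 * filled from the bounce buffer.  Parts that are already allocated are read
 * directly into the destination.
 */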
static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
        size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer = NULL;

    BlockDriver *drv = bs->drv;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;
    bool skip_write;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /*
     * Do not write anything when the BDS is inactive.  That is not
     * allowed, and it would not help.
     */
    skip_write = (bs->open_flags & BDRV_O_INACTIVE);

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * however, the copy-on-read code doesn't have its own BdrvChild for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating the cluster in the image file.  Note that this value
     * may exceed BDRV_REQUEST_MAX_BYTES (even when the original read did
     * not), which is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    while (cluster_bytes) {
        int64_t pnum;

        if (skip_write) {
            ret = 1; /* "already allocated", so nothing will be copied */
            pnum = MIN(cluster_bytes, max_transfer);
        } else {
            ret = bdrv_is_allocated(bs, cluster_offset,
                                    MIN(cluster_bytes, max_transfer), &pnum);
            if (ret < 0) {
                /*
                 * Safe to treat errors in querying allocation as if
                 * unallocated; we'll probably fail again soon on the
                 * read, but at least that will set a decent errno.
                 */
                pnum = MIN(cluster_bytes, max_transfer);
            }

            /* Stop at EOF if the image ends in the middle of the cluster */
            if (ret == 0 && pnum == 0) {
                assert(progress >= bytes);
                break;
            }

            assert(skip_bytes < pnum);
        }

        if (ret <= 0) {
            QEMUIOVector local_qiov;

            /* Must copy-on-read; use the bounce buffer */
            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            if (!bounce_buffer) {
                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);

                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
                if (!bounce_buffer) {
                    ret = -ENOMEM;
                    goto err;
                }
            }
            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            if (!(flags & BDRV_REQ_PREFETCH)) {
                qemu_iovec_from_buf(qiov, qiov_offset + progress,
                                    bounce_buffer + skip_bytes,
                                    MIN(pnum - skip_bytes, bytes - progress));
            }
        } else if (!(flags & BDRV_REQ_PREFETCH)) {
            /* Read directly into the destination */
            ret = bdrv_driver_preadv(bs, offset + progress,
                                     MIN(pnum - skip_bytes, bytes - progress),
                                     qiov, qiov_offset + progress, 0);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
                                           qiov, qiov_offset, flags);
            goto out;
        } else if (flags & BDRV_REQ_PREFETCH) {
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, qiov,
                                     qiov_offset + bytes - bytes_remaining, 0);
            max_bytes -= num;
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
                                    0, bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Request padding
 *
 *  |<---- align ----->|                     |<----- align ---->|
 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
 *  |          |       |                     |     |            |
 * -*----------$-------*-------- ... --------*-----$------------*---
 *  |          |       |                     |     |            |
 *  |          offset  |                     |     end          |
 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
 *  [buf   ... )                             [tail_buf  )
 *
 * @buf is an aligned allocation needed to store @head and @tail paddings.
 * @head is placed at the beginning of @buf and @tail at the end.
 *
 * @tail_buf is a pointer to a sub-buffer corresponding to the align-sized
 * chunk around the tail, if a tail exists.
 *
 * @merge_reads is true for small requests,
 * if @buf_len == @head + bytes + @tail. In this case it is possible that both
 * head and tail exist but @buf_len == align and @tail_buf == @buf.
 */
typedef struct BdrvRequestPadding {
    uint8_t *buf;
    size_t buf_len;
    uint8_t *tail_buf;
    size_t head;
    size_t tail;
    bool merge_reads;
    QEMUIOVector local_qiov;
} BdrvRequestPadding;

static bool bdrv_init_padding(BlockDriverState *bs,
                              int64_t offset, int64_t bytes,
                              BdrvRequestPadding *pad)
{
    uint64_t align = bs->bl.request_alignment;
    size_t sum;

    memset(pad, 0, sizeof(*pad));

    pad->head = offset & (align - 1);
    pad->tail = ((offset + bytes) & (align - 1));
    if (pad->tail) {
        pad->tail = align - pad->tail;
    }

    if (!pad->head && !pad->tail) {
        return false;
    }

    assert(bytes); /* Nothing good in aligning zero-length requests */

    sum = pad->head + bytes + pad->tail;
    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
    pad->buf = qemu_blockalign(bs, pad->buf_len);
    pad->merge_reads = sum == pad->buf_len;
    if (pad->tail) {
        pad->tail_buf = pad->buf + pad->buf_len - align;
    }

    return true;
}

static int bdrv_padding_rmw_read(BdrvChild *child,
                                 BdrvTrackedRequest *req,
                                 BdrvRequestPadding *pad,
                                 bool zero_middle)
{
    QEMUIOVector local_qiov;
    BlockDriverState *bs = child->bs;
    uint64_t align = bs->bl.request_alignment;
    int ret;

    assert(req->serialising && pad->buf);

    if (pad->head || pad->merge_reads) {
        uint64_t bytes = pad->merge_reads ? pad->buf_len : align;

        qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);

        if (pad->head) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        }
        if (pad->merge_reads && pad->tail) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        }
        ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
                                  align, &local_qiov, 0, 0);
        if (ret < 0) {
            return ret;
        }
        if (pad->head) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
        }
        if (pad->merge_reads && pad->tail) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
        }

        if (pad->merge_reads) {
            goto zero_mem;
        }
    }

    if (pad->tail) {
        qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(
                child, req,
                req->overlap_offset + req->overlap_bytes - align,
                align, align, &local_qiov, 0, 0);
        if (ret < 0) {
            return ret;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
    }

zero_mem:
    if (zero_middle) {
        memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
    }

    return 0;
}

static void bdrv_padding_destroy(BdrvRequestPadding *pad)
{
    if (pad->buf) {
        qemu_vfree(pad->buf);
        qemu_iovec_destroy(&pad->local_qiov);
    }
}
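
/*
 * Worked example for bdrv_init_padding() (illustrative numbers): with
 * request_alignment = 512, offset = 1000 and bytes = 3000 we get
 * head = 1000 % 512 = 488 and tail = 512 - (4000 % 512) = 96, so
 * sum = 488 + 3000 + 96 = 3584.  Since both head and tail are present and
 * sum > align, buf_len = 2 * 512 = 1024, merge_reads is false and
 * tail_buf = buf + 512.  The padded request then covers [512, 4096).
 */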

/*
 * bdrv_pad_request
 *
 * Exchange request parameters with padded request if needed. Don't include RMW
 * read of padding, bdrv_padding_rmw_read() should be called separately if
 * needed.
 *
 * All parameters except @bs are in-out: they represent the original request
 * at function call and the padded request (if padding is needed) on return.
 *
 * Function always succeeds.
 */
static bool bdrv_pad_request(BlockDriverState *bs,
                             QEMUIOVector **qiov, size_t *qiov_offset,
                             int64_t *offset, unsigned int *bytes,
                             BdrvRequestPadding *pad)
{
    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
        return false;
    }

    qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
                             *qiov, *qiov_offset, *bytes,
                             pad->buf + pad->buf_len - pad->tail, pad->tail);
    *bytes += pad->head + pad->tail;
    *offset -= pad->head;
    *qiov = &pad->local_qiov;
    *qiov_offset = 0;

    return true;
}

int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
}

int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
    int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    BdrvRequestPadding pad;
    int ret;

    trace_bdrv_co_preadv(bs, offset, bytes, flags);

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning a zero request is nonsense. Even if the driver has a
         * special meaning for zero-length requests (like
         * qcow2_co_pwritev_compressed_part), we can't pass it to the driver
         * due to request_alignment.
         *
         * Still, there is no reason to return an error if someone does an
         * unaligned zero-length read occasionally.
         */
        return 0;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
                              bs->bl.request_alignment,
                              qiov, qiov_offset, flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    bdrv_padding_destroy(&pad);

    return ret;
}

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    void *buf = NULL;
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
        return -ENOTSUP;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.
             */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            if (buf == NULL) {
                buf = qemu_try_blockalign0(bs, num);
                if (buf == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }
            qemu_iovec_init_buf(&qiov, buf, num);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(buf);
                buf = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(buf);
    return ret;
}

static inline int coroutine_fn
bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
                          BdrvTrackedRequest *req, int flags)
{
    BlockDriverState *bs = child->bs;
    bool waited;
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);

    if (bs->read_only) {
        return -EPERM;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));

    if (flags & BDRV_REQ_SERIALISING) {
        waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
        /*
         * For a misaligned request we should have already waited earlier,
         * because we come after bdrv_padding_rmw_read which must be called
         * with the request already marked as serialising.
         */
        assert(!waited ||
               (req->offset == req->overlap_offset &&
                req->bytes == req->overlap_bytes));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    switch (req->type) {
    case BDRV_TRACKED_WRITE:
    case BDRV_TRACKED_DISCARD:
        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
        } else {
            assert(child->perm & BLK_PERM_WRITE);
        }
        return notifier_with_return_list_notify(&bs->before_write_notifiers,
                                                req);
    case BDRV_TRACKED_TRUNCATE:
        assert(child->perm & BLK_PERM_RESIZE);
        return 0;
    default:
        abort();
    }
}

static inline void coroutine_fn
bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
                         BdrvTrackedRequest *req, int ret)
{
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    BlockDriverState *bs = child->bs;

    atomic_inc(&bs->write_gen);

    /*
     * Discard cannot extend the image, but in error handling cases, such as
     * when reverting a qcow2 cluster allocation, the discarded range can
     * extend past the end of the image file, so we cannot assert about
     * BDRV_TRACKED_DISCARD here.  Instead, just skip it, since semantically
     * a discard request beyond EOF cannot expand the image anyway.
     */
    if (ret == 0 &&
        (req->type == BDRV_TRACKED_TRUNCATE ||
         end_sector > bs->total_sectors) &&
        req->type != BDRV_TRACKED_DISCARD) {
        bs->total_sectors = end_sector;
        bdrv_parent_cb_resize(bs);
        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
    }
    if (req->bytes) {
        switch (req->type) {
        case BDRV_TRACKED_WRITE:
            stat64_max(&bs->wr_highest_offset, offset + bytes);
            /* fall through, to set dirty bits */
        case BDRV_TRACKED_DISCARD:
            bdrv_set_dirty(bs, offset, bytes);
            break;
        default:
            break;
        }
    }
}
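
/*
 * Overview of the write path as implemented below: bdrv_co_pwritev_part()
 * pads a misaligned request (reading the head/tail via
 * bdrv_padding_rmw_read()), bdrv_aligned_pwritev() then calls
 * bdrv_co_write_req_prepare(), fragments the request according to
 * bs->bl.max_transfer and hands the pieces to bdrv_driver_pwritev(), and
 * bdrv_co_write_req_finish() finally updates write_gen, the dirty bitmaps
 * and the wr_highest_offset statistic.
 */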
1972 */ 1973 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1974 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1975 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 1976 { 1977 BlockDriverState *bs = child->bs; 1978 BlockDriver *drv = bs->drv; 1979 int ret; 1980 1981 uint64_t bytes_remaining = bytes; 1982 int max_transfer; 1983 1984 if (!drv) { 1985 return -ENOMEDIUM; 1986 } 1987 1988 if (bdrv_has_readonly_bitmaps(bs)) { 1989 return -EPERM; 1990 } 1991 1992 assert(is_power_of_2(align)); 1993 assert((offset & (align - 1)) == 0); 1994 assert((bytes & (align - 1)) == 0); 1995 assert(!qiov || qiov_offset + bytes <= qiov->size); 1996 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1997 align); 1998 1999 ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 2000 2001 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 2002 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 2003 qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 2004 flags |= BDRV_REQ_ZERO_WRITE; 2005 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 2006 flags |= BDRV_REQ_MAY_UNMAP; 2007 } 2008 } 2009 2010 if (ret < 0) { 2011 /* Do nothing, write notifier decided to fail this request */ 2012 } else if (flags & BDRV_REQ_ZERO_WRITE) { 2013 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 2014 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 2015 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 2016 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 2017 qiov, qiov_offset); 2018 } else if (bytes <= max_transfer) { 2019 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2020 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 2021 } else { 2022 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2023 while (bytes_remaining) { 2024 int num = MIN(bytes_remaining, max_transfer); 2025 int local_flags = flags; 2026 2027 assert(num); 2028 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 2029 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 2030 /* If FUA is going to be emulated by flush, we only 2031 * need to flush on the last iteration */ 2032 local_flags &= ~BDRV_REQ_FUA; 2033 } 2034 2035 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 2036 num, qiov, 2037 qiov_offset + bytes - bytes_remaining, 2038 local_flags); 2039 if (ret < 0) { 2040 break; 2041 } 2042 bytes_remaining -= num; 2043 } 2044 } 2045 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 2046 2047 if (ret >= 0) { 2048 ret = 0; 2049 } 2050 bdrv_co_write_req_finish(child, offset, bytes, req, ret); 2051 2052 return ret; 2053 } 2054 2055 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 2056 int64_t offset, 2057 unsigned int bytes, 2058 BdrvRequestFlags flags, 2059 BdrvTrackedRequest *req) 2060 { 2061 BlockDriverState *bs = child->bs; 2062 QEMUIOVector local_qiov; 2063 uint64_t align = bs->bl.request_alignment; 2064 int ret = 0; 2065 bool padding; 2066 BdrvRequestPadding pad; 2067 2068 padding = bdrv_init_padding(bs, offset, bytes, &pad); 2069 if (padding) { 2070 bdrv_mark_request_serialising(req, align); 2071 2072 bdrv_padding_rmw_read(child, req, &pad, true); 2073 2074 if (pad.head || pad.merge_reads) { 2075 int64_t aligned_offset = offset & ~(align - 1); 2076 int64_t write_bytes = pad.merge_reads ? 
                                         pad.buf_len : align;

            qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
            ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
                                       align, &local_qiov, 0,
                                       flags & ~BDRV_REQ_ZERO_WRITE);
            if (ret < 0 || pad.merge_reads) {
                /* Error or all work is done */
                goto out;
            }
            offset += write_bytes - pad.head;
            bytes -= write_bytes - pad.head;
        }
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, 0, flags);
        if (ret < 0) {
            goto out;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == pad.tail + bytes);

        qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, 0,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
    }

out:
    bdrv_padding_destroy(&pad);

    return ret;
}

/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
}

int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    BdrvRequestPadding pad;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* If the request is misaligned then we can't make it efficient */
    if ((flags & BDRV_REQ_NO_FALLBACK) &&
        !QEMU_IS_ALIGNED(offset | bytes, align))
    {
        return -ENOTSUP;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning a zero-length request is nonsense. Even if the driver
         * gives zero-length requests a special meaning (like
         * qcow2_co_pwritev_compressed_part), we cannot pass the request on
         * to the driver because of request_alignment.
         *
         * Still, there is no reason to return an error if someone happens
         * to issue an unaligned zero-length write occasionally.
         */
        return 0;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
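     *
     * A concrete example of the padding arithmetic (assuming a
     * request_alignment of 4096 bytes, purely for illustration): a write of
     * 2048 bytes at offset 5120 is misaligned, so it is widened to the
     * aligned range [4096, 8192).  pad.head is 5120 - 4096 = 1024 bytes and
     * pad.tail is 8192 - (5120 + 2048) = 1024 bytes; both pieces are read
     * back by bdrv_padding_rmw_read() and then written out together with
     * the guest data as a single aligned request.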
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) {
        bdrv_mark_request_serialising(&req, align);
        bdrv_padding_rmw_read(child, &req, &pad, false);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               qiov, qiov_offset, flags);

    bdrv_padding_destroy(&pad);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}

int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Flush ALL BDSes, regardless of whether they are reachable via a
 * BlockBackend.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    /*
     * The bdrv queue is managed by record/replay; creating a new flush
     * request for stopping the VM may break determinism.
     */
    if (replay_events_enabled()) {
        return result;
    }

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}


typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    bool want_zero;
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
    BlockDriverState **file;
} BdrvCoBlockStatusData;

/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to. If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state. Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.
Otherwise, if the 2282 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 2283 * set to the host mapping and BDS corresponding to the guest offset. 2284 */ 2285 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2286 bool want_zero, 2287 int64_t offset, int64_t bytes, 2288 int64_t *pnum, int64_t *map, 2289 BlockDriverState **file) 2290 { 2291 int64_t total_size; 2292 int64_t n; /* bytes */ 2293 int ret; 2294 int64_t local_map = 0; 2295 BlockDriverState *local_file = NULL; 2296 int64_t aligned_offset, aligned_bytes; 2297 uint32_t align; 2298 bool has_filtered_child; 2299 2300 assert(pnum); 2301 *pnum = 0; 2302 total_size = bdrv_getlength(bs); 2303 if (total_size < 0) { 2304 ret = total_size; 2305 goto early_out; 2306 } 2307 2308 if (offset >= total_size) { 2309 ret = BDRV_BLOCK_EOF; 2310 goto early_out; 2311 } 2312 if (!bytes) { 2313 ret = 0; 2314 goto early_out; 2315 } 2316 2317 n = total_size - offset; 2318 if (n < bytes) { 2319 bytes = n; 2320 } 2321 2322 /* Must be non-NULL or bdrv_getlength() would have failed */ 2323 assert(bs->drv); 2324 has_filtered_child = bdrv_filter_child(bs); 2325 if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { 2326 *pnum = bytes; 2327 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 2328 if (offset + bytes == total_size) { 2329 ret |= BDRV_BLOCK_EOF; 2330 } 2331 if (bs->drv->protocol_name) { 2332 ret |= BDRV_BLOCK_OFFSET_VALID; 2333 local_map = offset; 2334 local_file = bs; 2335 } 2336 goto early_out; 2337 } 2338 2339 bdrv_inc_in_flight(bs); 2340 2341 /* Round out to request_alignment boundaries */ 2342 align = bs->bl.request_alignment; 2343 aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2344 aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2345 2346 if (bs->drv->bdrv_co_block_status) { 2347 ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 2348 aligned_bytes, pnum, &local_map, 2349 &local_file); 2350 } else { 2351 /* Default code for filters */ 2352 2353 local_file = bdrv_filter_bs(bs); 2354 assert(local_file); 2355 2356 *pnum = aligned_bytes; 2357 local_map = aligned_offset; 2358 ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2359 } 2360 if (ret < 0) { 2361 *pnum = 0; 2362 goto out; 2363 } 2364 2365 /* 2366 * The driver's result must be a non-zero multiple of request_alignment. 2367 * Clamp pnum and adjust map to original request. 
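     *
     * For example (purely illustrative numbers): with align = 512, a query
     * at offset = 700 for bytes = 100 is rounded out to aligned_offset = 512
     * and aligned_bytes = 512.  If the driver reports *pnum = 512 for that
     * aligned range, the code below subtracts the 188 bytes of rounding
     * (700 - 512), clamps the result to the 100 bytes that were asked for,
     * and advances local_map by the same 188 bytes so that 'map' still
     * corresponds to the original guest offset.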
2368 */ 2369 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2370 align > offset - aligned_offset); 2371 if (ret & BDRV_BLOCK_RECURSE) { 2372 assert(ret & BDRV_BLOCK_DATA); 2373 assert(ret & BDRV_BLOCK_OFFSET_VALID); 2374 assert(!(ret & BDRV_BLOCK_ZERO)); 2375 } 2376 2377 *pnum -= offset - aligned_offset; 2378 if (*pnum > bytes) { 2379 *pnum = bytes; 2380 } 2381 if (ret & BDRV_BLOCK_OFFSET_VALID) { 2382 local_map += offset - aligned_offset; 2383 } 2384 2385 if (ret & BDRV_BLOCK_RAW) { 2386 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2387 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2388 *pnum, pnum, &local_map, &local_file); 2389 goto out; 2390 } 2391 2392 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2393 ret |= BDRV_BLOCK_ALLOCATED; 2394 } else if (want_zero && bs->drv->supports_backing) { 2395 BlockDriverState *cow_bs = bdrv_cow_bs(bs); 2396 2397 if (cow_bs) { 2398 int64_t size2 = bdrv_getlength(cow_bs); 2399 2400 if (size2 >= 0 && offset >= size2) { 2401 ret |= BDRV_BLOCK_ZERO; 2402 } 2403 } else { 2404 ret |= BDRV_BLOCK_ZERO; 2405 } 2406 } 2407 2408 if (want_zero && ret & BDRV_BLOCK_RECURSE && 2409 local_file && local_file != bs && 2410 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2411 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2412 int64_t file_pnum; 2413 int ret2; 2414 2415 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2416 *pnum, &file_pnum, NULL, NULL); 2417 if (ret2 >= 0) { 2418 /* Ignore errors. This is just providing extra information, it 2419 * is useful but not necessary. 2420 */ 2421 if (ret2 & BDRV_BLOCK_EOF && 2422 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2423 /* 2424 * It is valid for the format block driver to read 2425 * beyond the end of the underlying file's current 2426 * size; such areas read as zero. 2427 */ 2428 ret |= BDRV_BLOCK_ZERO; 2429 } else { 2430 /* Limit request to the range reported by the protocol driver */ 2431 *pnum = file_pnum; 2432 ret |= (ret2 & BDRV_BLOCK_ZERO); 2433 } 2434 } 2435 } 2436 2437 out: 2438 bdrv_dec_in_flight(bs); 2439 if (ret >= 0 && offset + *pnum == total_size) { 2440 ret |= BDRV_BLOCK_EOF; 2441 } 2442 early_out: 2443 if (file) { 2444 *file = local_file; 2445 } 2446 if (map) { 2447 *map = local_map; 2448 } 2449 return ret; 2450 } 2451 2452 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2453 BlockDriverState *base, 2454 bool want_zero, 2455 int64_t offset, 2456 int64_t bytes, 2457 int64_t *pnum, 2458 int64_t *map, 2459 BlockDriverState **file) 2460 { 2461 BlockDriverState *p; 2462 int ret = 0; 2463 bool first = true; 2464 2465 assert(bs != base); 2466 for (p = bs; p != base; p = bdrv_filter_or_cow_bs(p)) { 2467 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2468 file); 2469 if (ret < 0) { 2470 break; 2471 } 2472 if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2473 /* 2474 * Reading beyond the end of the file continues to read 2475 * zeroes, but we can only widen the result to the 2476 * unallocated length we learned from an earlier 2477 * iteration. 2478 */ 2479 *pnum = bytes; 2480 } 2481 if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2482 break; 2483 } 2484 /* [offset, pnum] unallocated on this layer, which could be only 2485 * the first part of [offset, bytes]. 
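         *
         * A brief illustration with made-up sizes: if this layer reports
         * 64 KiB as unallocated, 'bytes' is narrowed to those 64 KiB before
         * the next (filtered or backing) layer is consulted; if that layer
         * then reports data for only the first 32 KiB, the loop stops there
         * and the caller sees 32 KiB of allocated data rather than a result
         * that would overrun the range the upper layer vouched for.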
*/ 2486 bytes = MIN(bytes, *pnum); 2487 first = false; 2488 } 2489 return ret; 2490 } 2491 2492 /* Coroutine wrapper for bdrv_block_status_above() */ 2493 static int coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 2494 { 2495 BdrvCoBlockStatusData *data = opaque; 2496 2497 return bdrv_co_block_status_above(data->bs, data->base, 2498 data->want_zero, 2499 data->offset, data->bytes, 2500 data->pnum, data->map, data->file); 2501 } 2502 2503 /* 2504 * Synchronous wrapper around bdrv_co_block_status_above(). 2505 * 2506 * See bdrv_co_block_status_above() for details. 2507 */ 2508 static int bdrv_common_block_status_above(BlockDriverState *bs, 2509 BlockDriverState *base, 2510 bool want_zero, int64_t offset, 2511 int64_t bytes, int64_t *pnum, 2512 int64_t *map, 2513 BlockDriverState **file) 2514 { 2515 BdrvCoBlockStatusData data = { 2516 .bs = bs, 2517 .base = base, 2518 .want_zero = want_zero, 2519 .offset = offset, 2520 .bytes = bytes, 2521 .pnum = pnum, 2522 .map = map, 2523 .file = file, 2524 }; 2525 2526 return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data); 2527 } 2528 2529 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 2530 int64_t offset, int64_t bytes, int64_t *pnum, 2531 int64_t *map, BlockDriverState **file) 2532 { 2533 return bdrv_common_block_status_above(bs, base, true, offset, bytes, 2534 pnum, map, file); 2535 } 2536 2537 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2538 int64_t *pnum, int64_t *map, BlockDriverState **file) 2539 { 2540 return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), 2541 offset, bytes, pnum, map, file); 2542 } 2543 2544 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2545 int64_t bytes, int64_t *pnum) 2546 { 2547 int ret; 2548 int64_t dummy; 2549 2550 ret = bdrv_common_block_status_above(bs, bdrv_filter_or_cow_bs(bs), false, 2551 offset, bytes, pnum ? pnum : &dummy, 2552 NULL, NULL); 2553 if (ret < 0) { 2554 return ret; 2555 } 2556 return !!(ret & BDRV_BLOCK_ALLOCATED); 2557 } 2558 2559 /* 2560 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 2561 * 2562 * Return 1 if (a prefix of) the given range is allocated in any image 2563 * between BASE and TOP (BASE is only included if include_base is set). 2564 * BASE can be NULL to check if the given offset is allocated in any 2565 * image of the chain. Return 0 otherwise, or negative errno on 2566 * failure. 2567 * 2568 * 'pnum' is set to the number of bytes (including and immediately 2569 * following the specified offset) that are known to be in the same 2570 * allocated/unallocated state. Note that a subsequent call starting 2571 * at 'offset + *pnum' may return the same allocation status (in other 2572 * words, the result is not necessarily the maximum possible range); 2573 * but 'pnum' will only be 0 when end of file is reached. 
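 *
 * As a non-normative usage sketch, a caller that wants to walk the chain
 * range by range (in the spirit of stream/commit style loops) might do
 * something like the following; 'top', 'base', 'cur_offset' and 'end' are
 * hypothetical variables of the caller, with 'end' not past end of file:
 *
 *     while (cur_offset < end) {
 *         int64_t pnum;
 *         int ret = bdrv_is_allocated_above(top, base, false, cur_offset,
 *                                           end - cur_offset, &pnum);
 *         if (ret < 0) {
 *             break;        // error
 *         } else if (ret) {
 *             // [cur_offset, cur_offset + pnum) is allocated above 'base'
 *         }
 *         cur_offset += pnum;
 *     }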
2574 * 2575 */ 2576 int bdrv_is_allocated_above(BlockDriverState *top, 2577 BlockDriverState *base, 2578 bool include_base, int64_t offset, 2579 int64_t bytes, int64_t *pnum) 2580 { 2581 BlockDriverState *intermediate; 2582 int ret; 2583 int64_t n = bytes; 2584 2585 assert(base || !include_base); 2586 2587 intermediate = top; 2588 while (include_base || intermediate != base) { 2589 int64_t pnum_inter; 2590 int64_t size_inter; 2591 2592 assert(intermediate); 2593 ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 2594 if (ret < 0) { 2595 return ret; 2596 } 2597 if (ret) { 2598 *pnum = pnum_inter; 2599 return 1; 2600 } 2601 2602 size_inter = bdrv_getlength(intermediate); 2603 if (size_inter < 0) { 2604 return size_inter; 2605 } 2606 if (n > pnum_inter && 2607 (intermediate == top || offset + pnum_inter < size_inter)) { 2608 n = pnum_inter; 2609 } 2610 2611 if (intermediate == base) { 2612 break; 2613 } 2614 2615 intermediate = bdrv_filter_or_cow_bs(intermediate); 2616 } 2617 2618 *pnum = n; 2619 return 0; 2620 } 2621 2622 typedef struct BdrvVmstateCo { 2623 BlockDriverState *bs; 2624 QEMUIOVector *qiov; 2625 int64_t pos; 2626 bool is_read; 2627 } BdrvVmstateCo; 2628 2629 static int coroutine_fn 2630 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2631 bool is_read) 2632 { 2633 BlockDriver *drv = bs->drv; 2634 BlockDriverState *child_bs = bdrv_primary_bs(bs); 2635 int ret = -ENOTSUP; 2636 2637 bdrv_inc_in_flight(bs); 2638 2639 if (!drv) { 2640 ret = -ENOMEDIUM; 2641 } else if (drv->bdrv_load_vmstate) { 2642 if (is_read) { 2643 ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2644 } else { 2645 ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2646 } 2647 } else if (child_bs) { 2648 ret = bdrv_co_rw_vmstate(child_bs, qiov, pos, is_read); 2649 } 2650 2651 bdrv_dec_in_flight(bs); 2652 return ret; 2653 } 2654 2655 static int coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) 2656 { 2657 BdrvVmstateCo *co = opaque; 2658 2659 return bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); 2660 } 2661 2662 static inline int 2663 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2664 bool is_read) 2665 { 2666 BdrvVmstateCo data = { 2667 .bs = bs, 2668 .qiov = qiov, 2669 .pos = pos, 2670 .is_read = is_read, 2671 }; 2672 2673 return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data); 2674 } 2675 2676 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 2677 int64_t pos, int size) 2678 { 2679 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2680 int ret; 2681 2682 ret = bdrv_writev_vmstate(bs, &qiov, pos); 2683 if (ret < 0) { 2684 return ret; 2685 } 2686 2687 return size; 2688 } 2689 2690 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2691 { 2692 return bdrv_rw_vmstate(bs, qiov, pos, false); 2693 } 2694 2695 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 2696 int64_t pos, int size) 2697 { 2698 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2699 int ret; 2700 2701 ret = bdrv_readv_vmstate(bs, &qiov, pos); 2702 if (ret < 0) { 2703 return ret; 2704 } 2705 2706 return size; 2707 } 2708 2709 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2710 { 2711 return bdrv_rw_vmstate(bs, qiov, pos, true); 2712 } 2713 2714 /**************************************************************/ 2715 /* async I/Os */ 2716 2717 void bdrv_aio_cancel(BlockAIOCB *acb) 2718 { 2719 qemu_aio_ref(acb); 2720 bdrv_aio_cancel_async(acb); 2721 while (acb->refcnt > 1) { 2722 if 
(acb->aiocb_info->get_aio_context) { 2723 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2724 } else if (acb->bs) { 2725 /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 2726 * assert that we're not using an I/O thread. Thread-safe 2727 * code should use bdrv_aio_cancel_async exclusively. 2728 */ 2729 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 2730 aio_poll(bdrv_get_aio_context(acb->bs), true); 2731 } else { 2732 abort(); 2733 } 2734 } 2735 qemu_aio_unref(acb); 2736 } 2737 2738 /* Async version of aio cancel. The caller is not blocked if the acb implements 2739 * cancel_async, otherwise we do nothing and let the request normally complete. 2740 * In either case the completion callback must be called. */ 2741 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2742 { 2743 if (acb->aiocb_info->cancel_async) { 2744 acb->aiocb_info->cancel_async(acb); 2745 } 2746 } 2747 2748 /**************************************************************/ 2749 /* Coroutine block device emulation */ 2750 2751 static int coroutine_fn bdrv_flush_co_entry(void *opaque) 2752 { 2753 return bdrv_co_flush(opaque); 2754 } 2755 2756 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2757 { 2758 BdrvChild *primary_child = bdrv_primary_child(bs); 2759 BdrvChild *child; 2760 int current_gen; 2761 int ret = 0; 2762 2763 bdrv_inc_in_flight(bs); 2764 2765 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2766 bdrv_is_sg(bs)) { 2767 goto early_exit; 2768 } 2769 2770 qemu_co_mutex_lock(&bs->reqs_lock); 2771 current_gen = atomic_read(&bs->write_gen); 2772 2773 /* Wait until any previous flushes are completed */ 2774 while (bs->active_flush_req) { 2775 qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 2776 } 2777 2778 /* Flushes reach this point in nondecreasing current_gen order. */ 2779 bs->active_flush_req = true; 2780 qemu_co_mutex_unlock(&bs->reqs_lock); 2781 2782 /* Write back all layers by calling one driver function */ 2783 if (bs->drv->bdrv_co_flush) { 2784 ret = bs->drv->bdrv_co_flush(bs); 2785 goto out; 2786 } 2787 2788 /* Write back cached data to the OS even with cache=unsafe */ 2789 BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); 2790 if (bs->drv->bdrv_co_flush_to_os) { 2791 ret = bs->drv->bdrv_co_flush_to_os(bs); 2792 if (ret < 0) { 2793 goto out; 2794 } 2795 } 2796 2797 /* But don't actually force it to the disk with cache=unsafe */ 2798 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2799 goto flush_children; 2800 } 2801 2802 /* Check if we really need to flush anything */ 2803 if (bs->flushed_gen == current_gen) { 2804 goto flush_children; 2805 } 2806 2807 BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); 2808 if (!bs->drv) { 2809 /* bs->drv->bdrv_co_flush() might have ejected the BDS 2810 * (even in case of apparent success) */ 2811 ret = -ENOMEDIUM; 2812 goto out; 2813 } 2814 if (bs->drv->bdrv_co_flush_to_disk) { 2815 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2816 } else if (bs->drv->bdrv_aio_flush) { 2817 BlockAIOCB *acb; 2818 CoroutineIOCompletion co = { 2819 .coroutine = qemu_coroutine_self(), 2820 }; 2821 2822 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2823 if (acb == NULL) { 2824 ret = -EIO; 2825 } else { 2826 qemu_coroutine_yield(); 2827 ret = co.ret; 2828 } 2829 } else { 2830 /* 2831 * Some block drivers always operate in either writethrough or unsafe 2832 * mode and don't support bdrv_flush therefore. 
Usually qemu doesn't 2833 * know how the server works (because the behaviour is hardcoded or 2834 * depends on server-side configuration), so we can't ensure that 2835 * everything is safe on disk. Returning an error doesn't work because 2836 * that would break guests even if the server operates in writethrough 2837 * mode. 2838 * 2839 * Let's hope the user knows what he's doing. 2840 */ 2841 ret = 0; 2842 } 2843 2844 if (ret < 0) { 2845 goto out; 2846 } 2847 2848 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2849 * in the case of cache=unsafe, so there are no useless flushes. 2850 */ 2851 flush_children: 2852 ret = 0; 2853 QLIST_FOREACH(child, &bs->children, next) { 2854 if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { 2855 int this_child_ret = bdrv_co_flush(child->bs); 2856 if (!ret) { 2857 ret = this_child_ret; 2858 } 2859 } 2860 } 2861 2862 out: 2863 /* Notify any pending flushes that we have completed */ 2864 if (ret == 0) { 2865 bs->flushed_gen = current_gen; 2866 } 2867 2868 qemu_co_mutex_lock(&bs->reqs_lock); 2869 bs->active_flush_req = false; 2870 /* Return value is ignored - it's ok if wait queue is empty */ 2871 qemu_co_queue_next(&bs->flush_queue); 2872 qemu_co_mutex_unlock(&bs->reqs_lock); 2873 2874 early_exit: 2875 bdrv_dec_in_flight(bs); 2876 return ret; 2877 } 2878 2879 int bdrv_flush(BlockDriverState *bs) 2880 { 2881 return bdrv_run_co(bs, bdrv_flush_co_entry, bs); 2882 } 2883 2884 typedef struct DiscardCo { 2885 BdrvChild *child; 2886 int64_t offset; 2887 int64_t bytes; 2888 } DiscardCo; 2889 2890 static int coroutine_fn bdrv_pdiscard_co_entry(void *opaque) 2891 { 2892 DiscardCo *rwco = opaque; 2893 2894 return bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); 2895 } 2896 2897 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2898 int64_t bytes) 2899 { 2900 BdrvTrackedRequest req; 2901 int max_pdiscard, ret; 2902 int head, tail, align; 2903 BlockDriverState *bs = child->bs; 2904 2905 if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 2906 return -ENOMEDIUM; 2907 } 2908 2909 if (bdrv_has_readonly_bitmaps(bs)) { 2910 return -EPERM; 2911 } 2912 2913 if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) { 2914 return -EIO; 2915 } 2916 2917 /* Do nothing if disabled. */ 2918 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2919 return 0; 2920 } 2921 2922 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 2923 return 0; 2924 } 2925 2926 /* Discard is advisory, but some devices track and coalesce 2927 * unaligned requests, so we must pass everything down rather than 2928 * round here. Still, most devices will just silently ignore 2929 * unaligned requests (by returning -ENOTSUP), so we must fragment 2930 * the request accordingly. */ 2931 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2932 assert(align % bs->bl.request_alignment == 0); 2933 head = offset % align; 2934 tail = (offset + bytes) % align; 2935 2936 bdrv_inc_in_flight(bs); 2937 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 2938 2939 ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2940 if (ret < 0) { 2941 goto out; 2942 } 2943 2944 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 2945 align); 2946 assert(max_pdiscard >= bs->bl.request_alignment); 2947 2948 while (bytes > 0) { 2949 int64_t num = bytes; 2950 2951 if (head) { 2952 /* Make small requests to get to alignment boundaries. 
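             *
             * For instance (illustrative numbers only): with an effective
             * discard alignment of 64 KiB and a 512-byte request_alignment,
             * a 200 KiB discard at offset 100 KiB is split into a 28 KiB
             * head fragment (up to the 128 KiB boundary), a 128 KiB aligned
             * middle fragment, and a 44 KiB tail fragment, each passed to
             * the driver separately.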
*/ 2953 num = MIN(bytes, align - head); 2954 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 2955 num %= bs->bl.request_alignment; 2956 } 2957 head = (head + num) % align; 2958 assert(num < max_pdiscard); 2959 } else if (tail) { 2960 if (num > align) { 2961 /* Shorten the request to the last aligned cluster. */ 2962 num -= tail; 2963 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 2964 tail > bs->bl.request_alignment) { 2965 tail %= bs->bl.request_alignment; 2966 num -= tail; 2967 } 2968 } 2969 /* limit request size */ 2970 if (num > max_pdiscard) { 2971 num = max_pdiscard; 2972 } 2973 2974 if (!bs->drv) { 2975 ret = -ENOMEDIUM; 2976 goto out; 2977 } 2978 if (bs->drv->bdrv_co_pdiscard) { 2979 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 2980 } else { 2981 BlockAIOCB *acb; 2982 CoroutineIOCompletion co = { 2983 .coroutine = qemu_coroutine_self(), 2984 }; 2985 2986 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 2987 bdrv_co_io_em_complete, &co); 2988 if (acb == NULL) { 2989 ret = -EIO; 2990 goto out; 2991 } else { 2992 qemu_coroutine_yield(); 2993 ret = co.ret; 2994 } 2995 } 2996 if (ret && ret != -ENOTSUP) { 2997 goto out; 2998 } 2999 3000 offset += num; 3001 bytes -= num; 3002 } 3003 ret = 0; 3004 out: 3005 bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3006 tracked_request_end(&req); 3007 bdrv_dec_in_flight(bs); 3008 return ret; 3009 } 3010 3011 int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes) 3012 { 3013 DiscardCo rwco = { 3014 .child = child, 3015 .offset = offset, 3016 .bytes = bytes, 3017 }; 3018 3019 return bdrv_run_co(child->bs, bdrv_pdiscard_co_entry, &rwco); 3020 } 3021 3022 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 3023 { 3024 BlockDriver *drv = bs->drv; 3025 CoroutineIOCompletion co = { 3026 .coroutine = qemu_coroutine_self(), 3027 }; 3028 BlockAIOCB *acb; 3029 3030 bdrv_inc_in_flight(bs); 3031 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 3032 co.ret = -ENOTSUP; 3033 goto out; 3034 } 3035 3036 if (drv->bdrv_co_ioctl) { 3037 co.ret = drv->bdrv_co_ioctl(bs, req, buf); 3038 } else { 3039 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 3040 if (!acb) { 3041 co.ret = -ENOTSUP; 3042 goto out; 3043 } 3044 qemu_coroutine_yield(); 3045 } 3046 out: 3047 bdrv_dec_in_flight(bs); 3048 return co.ret; 3049 } 3050 3051 void *qemu_blockalign(BlockDriverState *bs, size_t size) 3052 { 3053 return qemu_memalign(bdrv_opt_mem_align(bs), size); 3054 } 3055 3056 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 3057 { 3058 return memset(qemu_blockalign(bs, size), 0, size); 3059 } 3060 3061 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 3062 { 3063 size_t align = bdrv_opt_mem_align(bs); 3064 3065 /* Ensure that NULL is never returned on success */ 3066 assert(align > 0); 3067 if (size == 0) { 3068 size = align; 3069 } 3070 3071 return qemu_try_memalign(align, size); 3072 } 3073 3074 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 3075 { 3076 void *mem = qemu_try_blockalign(bs, size); 3077 3078 if (mem) { 3079 memset(mem, 0, size); 3080 } 3081 3082 return mem; 3083 } 3084 3085 /* 3086 * Check if all memory in this vector is sector aligned. 
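 *
 * A hedged example of preparing a vector that satisfies this check:
 * allocate the backing memory with qemu_blockalign(), which honours
 * bdrv_opt_mem_align(), instead of a plain malloc()-family allocator:
 *
 *     size_t len = 64 * 1024;                  // hypothetical buffer size
 *     void *buf = qemu_blockalign(bs, len);    // suitably aligned memory
 *     QEMUIOVector qiov;
 *     qemu_iovec_init_buf(&qiov, buf, len);
 *     // ... issue I/O with &qiov ...
 *     qemu_vfree(buf);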
3087 */ 3088 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 3089 { 3090 int i; 3091 size_t alignment = bdrv_min_mem_align(bs); 3092 3093 for (i = 0; i < qiov->niov; i++) { 3094 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 3095 return false; 3096 } 3097 if (qiov->iov[i].iov_len % alignment) { 3098 return false; 3099 } 3100 } 3101 3102 return true; 3103 } 3104 3105 void bdrv_add_before_write_notifier(BlockDriverState *bs, 3106 NotifierWithReturn *notifier) 3107 { 3108 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 3109 } 3110 3111 void bdrv_io_plug(BlockDriverState *bs) 3112 { 3113 BdrvChild *child; 3114 3115 QLIST_FOREACH(child, &bs->children, next) { 3116 bdrv_io_plug(child->bs); 3117 } 3118 3119 if (atomic_fetch_inc(&bs->io_plugged) == 0) { 3120 BlockDriver *drv = bs->drv; 3121 if (drv && drv->bdrv_io_plug) { 3122 drv->bdrv_io_plug(bs); 3123 } 3124 } 3125 } 3126 3127 void bdrv_io_unplug(BlockDriverState *bs) 3128 { 3129 BdrvChild *child; 3130 3131 assert(bs->io_plugged); 3132 if (atomic_fetch_dec(&bs->io_plugged) == 1) { 3133 BlockDriver *drv = bs->drv; 3134 if (drv && drv->bdrv_io_unplug) { 3135 drv->bdrv_io_unplug(bs); 3136 } 3137 } 3138 3139 QLIST_FOREACH(child, &bs->children, next) { 3140 bdrv_io_unplug(child->bs); 3141 } 3142 } 3143 3144 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 3145 { 3146 BdrvChild *child; 3147 3148 if (bs->drv && bs->drv->bdrv_register_buf) { 3149 bs->drv->bdrv_register_buf(bs, host, size); 3150 } 3151 QLIST_FOREACH(child, &bs->children, next) { 3152 bdrv_register_buf(child->bs, host, size); 3153 } 3154 } 3155 3156 void bdrv_unregister_buf(BlockDriverState *bs, void *host) 3157 { 3158 BdrvChild *child; 3159 3160 if (bs->drv && bs->drv->bdrv_unregister_buf) { 3161 bs->drv->bdrv_unregister_buf(bs, host); 3162 } 3163 QLIST_FOREACH(child, &bs->children, next) { 3164 bdrv_unregister_buf(child->bs, host); 3165 } 3166 } 3167 3168 static int coroutine_fn bdrv_co_copy_range_internal( 3169 BdrvChild *src, uint64_t src_offset, BdrvChild *dst, 3170 uint64_t dst_offset, uint64_t bytes, 3171 BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3172 bool recurse_src) 3173 { 3174 BdrvTrackedRequest req; 3175 int ret; 3176 3177 /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3178 assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3179 assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 3180 3181 if (!dst || !dst->bs) { 3182 return -ENOMEDIUM; 3183 } 3184 ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); 3185 if (ret) { 3186 return ret; 3187 } 3188 if (write_flags & BDRV_REQ_ZERO_WRITE) { 3189 return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3190 } 3191 3192 if (!src || !src->bs) { 3193 return -ENOMEDIUM; 3194 } 3195 ret = bdrv_check_byte_request(src->bs, src_offset, bytes); 3196 if (ret) { 3197 return ret; 3198 } 3199 3200 if (!src->bs->drv->bdrv_co_copy_range_from 3201 || !dst->bs->drv->bdrv_co_copy_range_to 3202 || src->bs->encrypted || dst->bs->encrypted) { 3203 return -ENOTSUP; 3204 } 3205 3206 if (recurse_src) { 3207 bdrv_inc_in_flight(src->bs); 3208 tracked_request_begin(&req, src->bs, src_offset, bytes, 3209 BDRV_TRACKED_READ); 3210 3211 /* BDRV_REQ_SERIALISING is only for write operation */ 3212 assert(!(read_flags & BDRV_REQ_SERIALISING)); 3213 bdrv_wait_serialising_requests(&req); 3214 3215 ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3216 src, src_offset, 3217 dst, dst_offset, 3218 bytes, 3219 read_flags, write_flags); 3220 3221 tracked_request_end(&req); 
3222 bdrv_dec_in_flight(src->bs); 3223 } else { 3224 bdrv_inc_in_flight(dst->bs); 3225 tracked_request_begin(&req, dst->bs, dst_offset, bytes, 3226 BDRV_TRACKED_WRITE); 3227 ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 3228 write_flags); 3229 if (!ret) { 3230 ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3231 src, src_offset, 3232 dst, dst_offset, 3233 bytes, 3234 read_flags, write_flags); 3235 } 3236 bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3237 tracked_request_end(&req); 3238 bdrv_dec_in_flight(dst->bs); 3239 } 3240 3241 return ret; 3242 } 3243 3244 /* Copy range from @src to @dst. 3245 * 3246 * See the comment of bdrv_co_copy_range for the parameter and return value 3247 * semantics. */ 3248 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, 3249 BdrvChild *dst, uint64_t dst_offset, 3250 uint64_t bytes, 3251 BdrvRequestFlags read_flags, 3252 BdrvRequestFlags write_flags) 3253 { 3254 trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3255 read_flags, write_flags); 3256 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3257 bytes, read_flags, write_flags, true); 3258 } 3259 3260 /* Copy range from @src to @dst. 3261 * 3262 * See the comment of bdrv_co_copy_range for the parameter and return value 3263 * semantics. */ 3264 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, 3265 BdrvChild *dst, uint64_t dst_offset, 3266 uint64_t bytes, 3267 BdrvRequestFlags read_flags, 3268 BdrvRequestFlags write_flags) 3269 { 3270 trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3271 read_flags, write_flags); 3272 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3273 bytes, read_flags, write_flags, false); 3274 } 3275 3276 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, 3277 BdrvChild *dst, uint64_t dst_offset, 3278 uint64_t bytes, BdrvRequestFlags read_flags, 3279 BdrvRequestFlags write_flags) 3280 { 3281 return bdrv_co_copy_range_from(src, src_offset, 3282 dst, dst_offset, 3283 bytes, read_flags, write_flags); 3284 } 3285 3286 static void bdrv_parent_cb_resize(BlockDriverState *bs) 3287 { 3288 BdrvChild *c; 3289 QLIST_FOREACH(c, &bs->parents, next_parent) { 3290 if (c->klass->resize) { 3291 c->klass->resize(c); 3292 } 3293 } 3294 } 3295 3296 /** 3297 * Truncate file to 'offset' bytes (needed only for file protocols) 3298 * 3299 * If 'exact' is true, the file must be resized to exactly the given 3300 * 'offset'. Otherwise, it is sufficient for the node to be at least 3301 * 'offset' bytes in length. 
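 *
 * A minimal usage sketch (assuming the caller already holds the needed
 * permissions on 'child', e.g. BLK_PERM_RESIZE, and is outside coroutine
 * context, so the synchronous bdrv_truncate() wrapper below applies):
 *
 *     Error *local_err = NULL;
 *     int ret = bdrv_truncate(child, 2LL * 1024 * 1024 * 1024, false,
 *                             PREALLOC_MODE_OFF, 0, &local_err);
 *     if (ret < 0) {
 *         error_report_err(local_err);
 *     }
 *
 * With exact = false, the node only has to end up at least that large.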
3302 */ 3303 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, 3304 PreallocMode prealloc, BdrvRequestFlags flags, 3305 Error **errp) 3306 { 3307 BlockDriverState *bs = child->bs; 3308 BdrvChild *filtered, *backing; 3309 BlockDriver *drv = bs->drv; 3310 BdrvTrackedRequest req; 3311 int64_t old_size, new_bytes; 3312 int ret; 3313 3314 3315 /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 3316 if (!drv) { 3317 error_setg(errp, "No medium inserted"); 3318 return -ENOMEDIUM; 3319 } 3320 if (offset < 0) { 3321 error_setg(errp, "Image size cannot be negative"); 3322 return -EINVAL; 3323 } 3324 3325 old_size = bdrv_getlength(bs); 3326 if (old_size < 0) { 3327 error_setg_errno(errp, -old_size, "Failed to get old image size"); 3328 return old_size; 3329 } 3330 3331 if (offset > old_size) { 3332 new_bytes = offset - old_size; 3333 } else { 3334 new_bytes = 0; 3335 } 3336 3337 bdrv_inc_in_flight(bs); 3338 tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, 3339 BDRV_TRACKED_TRUNCATE); 3340 3341 /* If we are growing the image and potentially using preallocation for the 3342 * new area, we need to make sure that no write requests are made to it 3343 * concurrently or they might be overwritten by preallocation. */ 3344 if (new_bytes) { 3345 bdrv_mark_request_serialising(&req, 1); 3346 } 3347 if (bs->read_only) { 3348 error_setg(errp, "Image is read-only"); 3349 ret = -EACCES; 3350 goto out; 3351 } 3352 ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, 3353 0); 3354 if (ret < 0) { 3355 error_setg_errno(errp, -ret, 3356 "Failed to prepare request for truncation"); 3357 goto out; 3358 } 3359 3360 filtered = bdrv_filter_child(bs); 3361 backing = bdrv_cow_child(bs); 3362 3363 /* 3364 * If the image has a backing file that is large enough that it would 3365 * provide data for the new area, we cannot leave it unallocated because 3366 * then the backing file content would become visible. Instead, zero-fill 3367 * the new area. 3368 * 3369 * Note that if the image has a backing file, but was opened without the 3370 * backing file, taking care of keeping things consistent with that backing 3371 * file is the user's responsibility. 3372 */ 3373 if (new_bytes && backing) { 3374 int64_t backing_len; 3375 3376 backing_len = bdrv_getlength(backing->bs); 3377 if (backing_len < 0) { 3378 ret = backing_len; 3379 error_setg_errno(errp, -ret, "Could not get backing file size"); 3380 goto out; 3381 } 3382 3383 if (backing_len > old_size) { 3384 flags |= BDRV_REQ_ZERO_WRITE; 3385 } 3386 } 3387 3388 if (drv->bdrv_co_truncate) { 3389 if (flags & ~bs->supported_truncate_flags) { 3390 error_setg(errp, "Block driver does not support requested flags"); 3391 ret = -ENOTSUP; 3392 goto out; 3393 } 3394 ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); 3395 } else if (filtered) { 3396 ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp); 3397 } else { 3398 error_setg(errp, "Image format driver does not support resize"); 3399 ret = -ENOTSUP; 3400 goto out; 3401 } 3402 if (ret < 0) { 3403 goto out; 3404 } 3405 3406 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3407 if (ret < 0) { 3408 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 3409 } else { 3410 offset = bs->total_sectors * BDRV_SECTOR_SIZE; 3411 } 3412 /* It's possible that truncation succeeded but refresh_total_sectors 3413 * failed, but the latter doesn't affect how we should finish the request. 
3414 * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */ 3415 bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); 3416 3417 out: 3418 tracked_request_end(&req); 3419 bdrv_dec_in_flight(bs); 3420 3421 return ret; 3422 } 3423 3424 typedef struct TruncateCo { 3425 BdrvChild *child; 3426 int64_t offset; 3427 bool exact; 3428 PreallocMode prealloc; 3429 BdrvRequestFlags flags; 3430 Error **errp; 3431 } TruncateCo; 3432 3433 static int coroutine_fn bdrv_truncate_co_entry(void *opaque) 3434 { 3435 TruncateCo *tco = opaque; 3436 3437 return bdrv_co_truncate(tco->child, tco->offset, tco->exact, 3438 tco->prealloc, tco->flags, tco->errp); 3439 } 3440 3441 int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, 3442 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) 3443 { 3444 TruncateCo tco = { 3445 .child = child, 3446 .offset = offset, 3447 .exact = exact, 3448 .prealloc = prealloc, 3449 .flags = flags, 3450 .errp = errp, 3451 }; 3452 3453 return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco); 3454 } 3455