1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "trace.h" 27 #include "sysemu/block-backend.h" 28 #include "block/aio-wait.h" 29 #include "block/blockjob.h" 30 #include "block/blockjob_int.h" 31 #include "block/block_int.h" 32 #include "qemu/cutils.h" 33 #include "qapi/error.h" 34 #include "qemu/error-report.h" 35 #include "qemu/main-loop.h" 36 #include "sysemu/replay.h" 37 38 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ 39 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) 40 41 static void bdrv_parent_cb_resize(BlockDriverState *bs); 42 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 43 int64_t offset, int bytes, BdrvRequestFlags flags); 44 45 static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore, 46 bool ignore_bds_parents) 47 { 48 BdrvChild *c, *next; 49 50 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { 51 if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) { 52 continue; 53 } 54 bdrv_parent_drained_begin_single(c, false); 55 } 56 } 57 58 static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c, 59 int *drained_end_counter) 60 { 61 assert(c->parent_quiesce_counter > 0); 62 c->parent_quiesce_counter--; 63 if (c->klass->drained_end) { 64 c->klass->drained_end(c, drained_end_counter); 65 } 66 } 67 68 void bdrv_parent_drained_end_single(BdrvChild *c) 69 { 70 int drained_end_counter = 0; 71 bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter); 72 BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0); 73 } 74 75 static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore, 76 bool ignore_bds_parents, 77 int *drained_end_counter) 78 { 79 BdrvChild *c; 80 81 QLIST_FOREACH(c, &bs->parents, next_parent) { 82 if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) { 83 continue; 84 } 85 bdrv_parent_drained_end_single_no_poll(c, drained_end_counter); 86 } 87 } 88 89 static bool bdrv_parent_drained_poll_single(BdrvChild *c) 90 { 91 if (c->klass->drained_poll) { 92 return c->klass->drained_poll(c); 93 } 94 return false; 95 } 96 97 static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, 98 bool ignore_bds_parents) 99 { 100 BdrvChild *c, *next; 101 bool busy = false; 102 103 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { 104 if (c == ignore || 
(ignore_bds_parents && c->klass->parent_is_bds)) { 105 continue; 106 } 107 busy |= bdrv_parent_drained_poll_single(c); 108 } 109 110 return busy; 111 } 112 113 void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll) 114 { 115 c->parent_quiesce_counter++; 116 if (c->klass->drained_begin) { 117 c->klass->drained_begin(c); 118 } 119 if (poll) { 120 BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c)); 121 } 122 } 123 124 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) 125 { 126 dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); 127 dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); 128 dst->opt_mem_alignment = MAX(dst->opt_mem_alignment, 129 src->opt_mem_alignment); 130 dst->min_mem_alignment = MAX(dst->min_mem_alignment, 131 src->min_mem_alignment); 132 dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov); 133 } 134 135 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) 136 { 137 BlockDriver *drv = bs->drv; 138 Error *local_err = NULL; 139 140 memset(&bs->bl, 0, sizeof(bs->bl)); 141 142 if (!drv) { 143 return; 144 } 145 146 /* Default alignment based on whether driver has byte interface */ 147 bs->bl.request_alignment = (drv->bdrv_co_preadv || 148 drv->bdrv_aio_preadv || 149 drv->bdrv_co_preadv_part) ? 1 : 512; 150 151 /* Take some limits from the children as a default */ 152 if (bs->file) { 153 bdrv_refresh_limits(bs->file->bs, &local_err); 154 if (local_err) { 155 error_propagate(errp, local_err); 156 return; 157 } 158 bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); 159 } else { 160 bs->bl.min_mem_alignment = 512; 161 bs->bl.opt_mem_alignment = qemu_real_host_page_size; 162 163 /* Safe default since most protocols use readv()/writev()/etc */ 164 bs->bl.max_iov = IOV_MAX; 165 } 166 167 if (bs->backing) { 168 bdrv_refresh_limits(bs->backing->bs, &local_err); 169 if (local_err) { 170 error_propagate(errp, local_err); 171 return; 172 } 173 bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); 174 } 175 176 /* Then let the driver override it */ 177 if (drv->bdrv_refresh_limits) { 178 drv->bdrv_refresh_limits(bs, errp); 179 } 180 } 181 182 /** 183 * The copy-on-read flag is actually a reference count so multiple users may 184 * use the feature without worrying about clobbering its previous state. 185 * Copy-on-read stays enabled until all users have called to disable it. 
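 *
 * A minimal usage sketch (illustrative only; the caller and the reads in the
 * middle are hypothetical, the two functions below are the real API):
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... issue reads whose data should be copied into bs ...
 *     bdrv_disable_copy_on_read(bs);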
186 */ 187 void bdrv_enable_copy_on_read(BlockDriverState *bs) 188 { 189 atomic_inc(&bs->copy_on_read); 190 } 191 192 void bdrv_disable_copy_on_read(BlockDriverState *bs) 193 { 194 int old = atomic_fetch_dec(&bs->copy_on_read); 195 assert(old >= 1); 196 } 197 198 typedef struct { 199 Coroutine *co; 200 BlockDriverState *bs; 201 bool done; 202 bool begin; 203 bool recursive; 204 bool poll; 205 BdrvChild *parent; 206 bool ignore_bds_parents; 207 int *drained_end_counter; 208 } BdrvCoDrainData; 209 210 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) 211 { 212 BdrvCoDrainData *data = opaque; 213 BlockDriverState *bs = data->bs; 214 215 if (data->begin) { 216 bs->drv->bdrv_co_drain_begin(bs); 217 } else { 218 bs->drv->bdrv_co_drain_end(bs); 219 } 220 221 /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */ 222 atomic_mb_set(&data->done, true); 223 if (!data->begin) { 224 atomic_dec(data->drained_end_counter); 225 } 226 bdrv_dec_in_flight(bs); 227 228 g_free(data); 229 } 230 231 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */ 232 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, 233 int *drained_end_counter) 234 { 235 BdrvCoDrainData *data; 236 237 if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) || 238 (!begin && !bs->drv->bdrv_co_drain_end)) { 239 return; 240 } 241 242 data = g_new(BdrvCoDrainData, 1); 243 *data = (BdrvCoDrainData) { 244 .bs = bs, 245 .done = false, 246 .begin = begin, 247 .drained_end_counter = drained_end_counter, 248 }; 249 250 if (!begin) { 251 atomic_inc(drained_end_counter); 252 } 253 254 /* Make sure the driver callback completes during the polling phase for 255 * drain_begin. */ 256 bdrv_inc_in_flight(bs); 257 data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data); 258 aio_co_schedule(bdrv_get_aio_context(bs), data->co); 259 } 260 261 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */ 262 bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, 263 BdrvChild *ignore_parent, bool ignore_bds_parents) 264 { 265 BdrvChild *child, *next; 266 267 if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) { 268 return true; 269 } 270 271 if (atomic_read(&bs->in_flight)) { 272 return true; 273 } 274 275 if (recursive) { 276 assert(!ignore_bds_parents); 277 QLIST_FOREACH_SAFE(child, &bs->children, next, next) { 278 if (bdrv_drain_poll(child->bs, recursive, child, false)) { 279 return true; 280 } 281 } 282 } 283 284 return false; 285 } 286 287 static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive, 288 BdrvChild *ignore_parent) 289 { 290 return bdrv_drain_poll(bs, recursive, ignore_parent, false); 291 } 292 293 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, 294 BdrvChild *parent, bool ignore_bds_parents, 295 bool poll); 296 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, 297 BdrvChild *parent, bool ignore_bds_parents, 298 int *drained_end_counter); 299 300 static void bdrv_co_drain_bh_cb(void *opaque) 301 { 302 BdrvCoDrainData *data = opaque; 303 Coroutine *co = data->co; 304 BlockDriverState *bs = data->bs; 305 306 if (bs) { 307 AioContext *ctx = bdrv_get_aio_context(bs); 308 AioContext *co_ctx = qemu_coroutine_get_aio_context(co); 309 310 /* 311 * When the coroutine yielded, the lock for its home context was 312 * released, so we need to re-acquire it here. 
If it explicitly 313 * acquired a different context, the lock is still held and we don't 314 * want to lock it a second time (or AIO_WAIT_WHILE() would hang). 315 */ 316 if (ctx == co_ctx) { 317 aio_context_acquire(ctx); 318 } 319 bdrv_dec_in_flight(bs); 320 if (data->begin) { 321 assert(!data->drained_end_counter); 322 bdrv_do_drained_begin(bs, data->recursive, data->parent, 323 data->ignore_bds_parents, data->poll); 324 } else { 325 assert(!data->poll); 326 bdrv_do_drained_end(bs, data->recursive, data->parent, 327 data->ignore_bds_parents, 328 data->drained_end_counter); 329 } 330 if (ctx == co_ctx) { 331 aio_context_release(ctx); 332 } 333 } else { 334 assert(data->begin); 335 bdrv_drain_all_begin(); 336 } 337 338 data->done = true; 339 aio_co_wake(co); 340 } 341 342 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, 343 bool begin, bool recursive, 344 BdrvChild *parent, 345 bool ignore_bds_parents, 346 bool poll, 347 int *drained_end_counter) 348 { 349 BdrvCoDrainData data; 350 351 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and 352 * other coroutines run if they were queued by aio_co_enter(). */ 353 354 assert(qemu_in_coroutine()); 355 data = (BdrvCoDrainData) { 356 .co = qemu_coroutine_self(), 357 .bs = bs, 358 .done = false, 359 .begin = begin, 360 .recursive = recursive, 361 .parent = parent, 362 .ignore_bds_parents = ignore_bds_parents, 363 .poll = poll, 364 .drained_end_counter = drained_end_counter, 365 }; 366 367 if (bs) { 368 bdrv_inc_in_flight(bs); 369 } 370 replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs), 371 bdrv_co_drain_bh_cb, &data); 372 373 qemu_coroutine_yield(); 374 /* If we are resumed from some other event (such as an aio completion or a 375 * timer callback), it is a bug in the caller that should be fixed. */ 376 assert(data.done); 377 } 378 379 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, 380 BdrvChild *parent, bool ignore_bds_parents) 381 { 382 assert(!qemu_in_coroutine()); 383 384 /* Stop things in parent-to-child order */ 385 if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { 386 aio_disable_external(bdrv_get_aio_context(bs)); 387 } 388 389 bdrv_parent_drained_begin(bs, parent, ignore_bds_parents); 390 bdrv_drain_invoke(bs, true, NULL); 391 } 392 393 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, 394 BdrvChild *parent, bool ignore_bds_parents, 395 bool poll) 396 { 397 BdrvChild *child, *next; 398 399 if (qemu_in_coroutine()) { 400 bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents, 401 poll, NULL); 402 return; 403 } 404 405 bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents); 406 407 if (recursive) { 408 assert(!ignore_bds_parents); 409 bs->recursive_quiesce_counter++; 410 QLIST_FOREACH_SAFE(child, &bs->children, next, next) { 411 bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents, 412 false); 413 } 414 } 415 416 /* 417 * Wait for drained requests to finish. 418 * 419 * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The 420 * call is needed so things in this AioContext can make progress even 421 * though we don't return to the main AioContext loop - this automatically 422 * includes other nodes in the same AioContext and therefore all child 423 * nodes. 
424 */ 425 if (poll) { 426 assert(!ignore_bds_parents); 427 BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent)); 428 } 429 } 430 431 void bdrv_drained_begin(BlockDriverState *bs) 432 { 433 bdrv_do_drained_begin(bs, false, NULL, false, true); 434 } 435 436 void bdrv_subtree_drained_begin(BlockDriverState *bs) 437 { 438 bdrv_do_drained_begin(bs, true, NULL, false, true); 439 } 440 441 /** 442 * This function does not poll, nor must any of its recursively called 443 * functions. The *drained_end_counter pointee will be incremented 444 * once for every background operation scheduled, and decremented once 445 * the operation settles. Therefore, the pointer must remain valid 446 * until the pointee reaches 0. That implies that whoever sets up the 447 * pointee has to poll until it is 0. 448 * 449 * We use atomic operations to access *drained_end_counter, because 450 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of 451 * @bs may contain nodes in different AioContexts, 452 * (2) bdrv_drain_all_end() uses the same counter for all nodes, 453 * regardless of which AioContext they are in. 454 */ 455 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, 456 BdrvChild *parent, bool ignore_bds_parents, 457 int *drained_end_counter) 458 { 459 BdrvChild *child; 460 int old_quiesce_counter; 461 462 assert(drained_end_counter != NULL); 463 464 if (qemu_in_coroutine()) { 465 bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents, 466 false, drained_end_counter); 467 return; 468 } 469 assert(bs->quiesce_counter > 0); 470 471 /* Re-enable things in child-to-parent order */ 472 bdrv_drain_invoke(bs, false, drained_end_counter); 473 bdrv_parent_drained_end(bs, parent, ignore_bds_parents, 474 drained_end_counter); 475 476 old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter); 477 if (old_quiesce_counter == 1) { 478 aio_enable_external(bdrv_get_aio_context(bs)); 479 } 480 481 if (recursive) { 482 assert(!ignore_bds_parents); 483 bs->recursive_quiesce_counter--; 484 QLIST_FOREACH(child, &bs->children, next) { 485 bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents, 486 drained_end_counter); 487 } 488 } 489 } 490 491 void bdrv_drained_end(BlockDriverState *bs) 492 { 493 int drained_end_counter = 0; 494 bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter); 495 BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0); 496 } 497 498 void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter) 499 { 500 bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter); 501 } 502 503 void bdrv_subtree_drained_end(BlockDriverState *bs) 504 { 505 int drained_end_counter = 0; 506 bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter); 507 BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0); 508 } 509 510 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent) 511 { 512 int i; 513 514 for (i = 0; i < new_parent->recursive_quiesce_counter; i++) { 515 bdrv_do_drained_begin(child->bs, true, child, false, true); 516 } 517 } 518 519 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent) 520 { 521 int drained_end_counter = 0; 522 int i; 523 524 for (i = 0; i < old_parent->recursive_quiesce_counter; i++) { 525 bdrv_do_drained_end(child->bs, true, child, false, 526 &drained_end_counter); 527 } 528 529 BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0); 530 } 531 532 /* 533 * Wait for pending requests to complete on a single 
 * BlockDriverState subtree, and suspend the block driver's internal I/O until
 * the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the AioContext lock
 * of the BlockDriverState.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay; waiting for the I/O
     * requests to finish may never terminate.
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop.
*/ 619 while ((bs = bdrv_next_all_states(bs))) { 620 AioContext *aio_context = bdrv_get_aio_context(bs); 621 622 aio_context_acquire(aio_context); 623 bdrv_do_drained_begin(bs, false, NULL, true, false); 624 aio_context_release(aio_context); 625 } 626 627 /* Now poll the in-flight requests */ 628 AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll()); 629 630 while ((bs = bdrv_next_all_states(bs))) { 631 bdrv_drain_assert_idle(bs); 632 } 633 } 634 635 void bdrv_drain_all_end(void) 636 { 637 BlockDriverState *bs = NULL; 638 int drained_end_counter = 0; 639 640 /* 641 * bdrv queue is managed by record/replay, 642 * waiting for finishing the I/O requests may 643 * be endless 644 */ 645 if (replay_events_enabled()) { 646 return; 647 } 648 649 while ((bs = bdrv_next_all_states(bs))) { 650 AioContext *aio_context = bdrv_get_aio_context(bs); 651 652 aio_context_acquire(aio_context); 653 bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter); 654 aio_context_release(aio_context); 655 } 656 657 assert(qemu_get_current_aio_context() == qemu_get_aio_context()); 658 AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0); 659 660 assert(bdrv_drain_all_count > 0); 661 bdrv_drain_all_count--; 662 } 663 664 void bdrv_drain_all(void) 665 { 666 bdrv_drain_all_begin(); 667 bdrv_drain_all_end(); 668 } 669 670 /** 671 * Remove an active request from the tracked requests list 672 * 673 * This function should be called when a tracked request is completing. 674 */ 675 static void tracked_request_end(BdrvTrackedRequest *req) 676 { 677 if (req->serialising) { 678 atomic_dec(&req->bs->serialising_in_flight); 679 } 680 681 qemu_co_mutex_lock(&req->bs->reqs_lock); 682 QLIST_REMOVE(req, list); 683 qemu_co_queue_restart_all(&req->wait_queue); 684 qemu_co_mutex_unlock(&req->bs->reqs_lock); 685 } 686 687 /** 688 * Add an active request to the tracked requests list 689 */ 690 static void tracked_request_begin(BdrvTrackedRequest *req, 691 BlockDriverState *bs, 692 int64_t offset, 693 uint64_t bytes, 694 enum BdrvTrackedRequestType type) 695 { 696 assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes); 697 698 *req = (BdrvTrackedRequest){ 699 .bs = bs, 700 .offset = offset, 701 .bytes = bytes, 702 .type = type, 703 .co = qemu_coroutine_self(), 704 .serialising = false, 705 .overlap_offset = offset, 706 .overlap_bytes = bytes, 707 }; 708 709 qemu_co_queue_init(&req->wait_queue); 710 711 qemu_co_mutex_lock(&bs->reqs_lock); 712 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 713 qemu_co_mutex_unlock(&bs->reqs_lock); 714 } 715 716 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 717 int64_t offset, uint64_t bytes) 718 { 719 /* aaaa bbbb */ 720 if (offset >= req->overlap_offset + req->overlap_bytes) { 721 return false; 722 } 723 /* bbbb aaaa */ 724 if (req->overlap_offset >= offset + bytes) { 725 return false; 726 } 727 return true; 728 } 729 730 static bool coroutine_fn 731 bdrv_wait_serialising_requests_locked(BlockDriverState *bs, 732 BdrvTrackedRequest *self) 733 { 734 BdrvTrackedRequest *req; 735 bool retry; 736 bool waited = false; 737 738 do { 739 retry = false; 740 QLIST_FOREACH(req, &bs->tracked_requests, list) { 741 if (req == self || (!req->serialising && !self->serialising)) { 742 continue; 743 } 744 if (tracked_request_overlaps(req, self->overlap_offset, 745 self->overlap_bytes)) 746 { 747 /* Hitting this means there was a reentrant request, for 748 * example, a block driver issuing nested requests. This must 749 * never happen since it means deadlock. 
750 */ 751 assert(qemu_coroutine_self() != req->co); 752 753 /* If the request is already (indirectly) waiting for us, or 754 * will wait for us as soon as it wakes up, then just go on 755 * (instead of producing a deadlock in the former case). */ 756 if (!req->waiting_for) { 757 self->waiting_for = req; 758 qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock); 759 self->waiting_for = NULL; 760 retry = true; 761 waited = true; 762 break; 763 } 764 } 765 } 766 } while (retry); 767 return waited; 768 } 769 770 bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 771 { 772 BlockDriverState *bs = req->bs; 773 int64_t overlap_offset = req->offset & ~(align - 1); 774 uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 775 - overlap_offset; 776 bool waited; 777 778 qemu_co_mutex_lock(&bs->reqs_lock); 779 if (!req->serialising) { 780 atomic_inc(&req->bs->serialising_in_flight); 781 req->serialising = true; 782 } 783 784 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 785 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 786 waited = bdrv_wait_serialising_requests_locked(bs, req); 787 qemu_co_mutex_unlock(&bs->reqs_lock); 788 return waited; 789 } 790 791 /** 792 * Return the tracked request on @bs for the current coroutine, or 793 * NULL if there is none. 794 */ 795 BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs) 796 { 797 BdrvTrackedRequest *req; 798 Coroutine *self = qemu_coroutine_self(); 799 800 QLIST_FOREACH(req, &bs->tracked_requests, list) { 801 if (req->co == self) { 802 return req; 803 } 804 } 805 806 return NULL; 807 } 808 809 /** 810 * Round a region to cluster boundaries 811 */ 812 void bdrv_round_to_clusters(BlockDriverState *bs, 813 int64_t offset, int64_t bytes, 814 int64_t *cluster_offset, 815 int64_t *cluster_bytes) 816 { 817 BlockDriverInfo bdi; 818 819 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 820 *cluster_offset = offset; 821 *cluster_bytes = bytes; 822 } else { 823 int64_t c = bdi.cluster_size; 824 *cluster_offset = QEMU_ALIGN_DOWN(offset, c); 825 *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); 826 } 827 } 828 829 static int bdrv_get_cluster_size(BlockDriverState *bs) 830 { 831 BlockDriverInfo bdi; 832 int ret; 833 834 ret = bdrv_get_info(bs, &bdi); 835 if (ret < 0 || bdi.cluster_size == 0) { 836 return bs->bl.request_alignment; 837 } else { 838 return bdi.cluster_size; 839 } 840 } 841 842 void bdrv_inc_in_flight(BlockDriverState *bs) 843 { 844 atomic_inc(&bs->in_flight); 845 } 846 847 void bdrv_wakeup(BlockDriverState *bs) 848 { 849 aio_wait_kick(); 850 } 851 852 void bdrv_dec_in_flight(BlockDriverState *bs) 853 { 854 atomic_dec(&bs->in_flight); 855 bdrv_wakeup(bs); 856 } 857 858 static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self) 859 { 860 BlockDriverState *bs = self->bs; 861 bool waited = false; 862 863 if (!atomic_read(&bs->serialising_in_flight)) { 864 return false; 865 } 866 867 qemu_co_mutex_lock(&bs->reqs_lock); 868 waited = bdrv_wait_serialising_requests_locked(bs, self); 869 qemu_co_mutex_unlock(&bs->reqs_lock); 870 871 return waited; 872 } 873 874 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 875 size_t size) 876 { 877 if (size > BDRV_REQUEST_MAX_BYTES) { 878 return -EIO; 879 } 880 881 if (!bdrv_is_inserted(bs)) { 882 return -ENOMEDIUM; 883 } 884 885 if (offset < 0) { 886 return -EIO; 887 } 888 889 return 0; 890 } 891 892 typedef int coroutine_fn BdrvRequestEntry(void *opaque); 
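
/*
 * Illustrative sketch of how BdrvRequestEntry is meant to be used together
 * with bdrv_run_co() below (the entry function and its argument struct are
 * hypothetical; the pattern follows bdrv_rw_co_entry()/bdrv_prwv_co() later
 * in this file):
 *
 *     typedef struct MyArgs { BlockDriverState *bs; } MyArgs;
 *
 *     static int coroutine_fn my_entry(void *opaque)
 *     {
 *         MyArgs *args = opaque;
 *         return do_something_in_coroutine_context(args->bs);
 *     }
 *
 *     // Runs my_entry() directly if already in coroutine context, otherwise
 *     // spawns a coroutine and polls until it finishes:
 *     ret = bdrv_run_co(bs, my_entry, &args);
 */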
893 typedef struct BdrvRunCo { 894 BdrvRequestEntry *entry; 895 void *opaque; 896 int ret; 897 bool done; 898 Coroutine *co; /* Coroutine, running bdrv_run_co_entry, for debugging */ 899 } BdrvRunCo; 900 901 static void coroutine_fn bdrv_run_co_entry(void *opaque) 902 { 903 BdrvRunCo *arg = opaque; 904 905 arg->ret = arg->entry(arg->opaque); 906 arg->done = true; 907 aio_wait_kick(); 908 } 909 910 static int bdrv_run_co(BlockDriverState *bs, BdrvRequestEntry *entry, 911 void *opaque) 912 { 913 if (qemu_in_coroutine()) { 914 /* Fast-path if already in coroutine context */ 915 return entry(opaque); 916 } else { 917 BdrvRunCo s = { .entry = entry, .opaque = opaque }; 918 919 s.co = qemu_coroutine_create(bdrv_run_co_entry, &s); 920 bdrv_coroutine_enter(bs, s.co); 921 922 BDRV_POLL_WHILE(bs, !s.done); 923 924 return s.ret; 925 } 926 } 927 928 typedef struct RwCo { 929 BdrvChild *child; 930 int64_t offset; 931 QEMUIOVector *qiov; 932 bool is_write; 933 BdrvRequestFlags flags; 934 } RwCo; 935 936 static int coroutine_fn bdrv_rw_co_entry(void *opaque) 937 { 938 RwCo *rwco = opaque; 939 940 if (!rwco->is_write) { 941 return bdrv_co_preadv(rwco->child, rwco->offset, 942 rwco->qiov->size, rwco->qiov, 943 rwco->flags); 944 } else { 945 return bdrv_co_pwritev(rwco->child, rwco->offset, 946 rwco->qiov->size, rwco->qiov, 947 rwco->flags); 948 } 949 } 950 951 /* 952 * Process a vectored synchronous request using coroutines 953 */ 954 static int bdrv_prwv_co(BdrvChild *child, int64_t offset, 955 QEMUIOVector *qiov, bool is_write, 956 BdrvRequestFlags flags) 957 { 958 RwCo rwco = { 959 .child = child, 960 .offset = offset, 961 .qiov = qiov, 962 .is_write = is_write, 963 .flags = flags, 964 }; 965 966 return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco); 967 } 968 969 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, 970 int bytes, BdrvRequestFlags flags) 971 { 972 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes); 973 974 return bdrv_prwv_co(child, offset, &qiov, true, 975 BDRV_REQ_ZERO_WRITE | flags); 976 } 977 978 /* 979 * Completely zero out a block device with the help of bdrv_pwrite_zeroes. 980 * The operation is sped up by checking the block status and only writing 981 * zeroes to the device if they currently do not return zeroes. Optional 982 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, 983 * BDRV_REQ_FUA). 984 * 985 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite(). 986 */ 987 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) 988 { 989 int ret; 990 int64_t target_size, bytes, offset = 0; 991 BlockDriverState *bs = child->bs; 992 993 target_size = bdrv_getlength(bs); 994 if (target_size < 0) { 995 return target_size; 996 } 997 998 for (;;) { 999 bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES); 1000 if (bytes <= 0) { 1001 return 0; 1002 } 1003 ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL); 1004 if (ret < 0) { 1005 return ret; 1006 } 1007 if (ret & BDRV_BLOCK_ZERO) { 1008 offset += bytes; 1009 continue; 1010 } 1011 ret = bdrv_pwrite_zeroes(child, offset, bytes, flags); 1012 if (ret < 0) { 1013 return ret; 1014 } 1015 offset += bytes; 1016 } 1017 } 1018 1019 /* return < 0 if error. 
See bdrv_pwrite() for the return codes */ 1020 int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) 1021 { 1022 int ret; 1023 1024 ret = bdrv_prwv_co(child, offset, qiov, false, 0); 1025 if (ret < 0) { 1026 return ret; 1027 } 1028 1029 return qiov->size; 1030 } 1031 1032 /* See bdrv_pwrite() for the return codes */ 1033 int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) 1034 { 1035 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); 1036 1037 if (bytes < 0) { 1038 return -EINVAL; 1039 } 1040 1041 return bdrv_preadv(child, offset, &qiov); 1042 } 1043 1044 int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) 1045 { 1046 int ret; 1047 1048 ret = bdrv_prwv_co(child, offset, qiov, true, 0); 1049 if (ret < 0) { 1050 return ret; 1051 } 1052 1053 return qiov->size; 1054 } 1055 1056 /* Return no. of bytes on success or < 0 on error. Important errors are: 1057 -EIO generic I/O error (may happen for all errors) 1058 -ENOMEDIUM No media inserted. 1059 -EINVAL Invalid offset or number of bytes 1060 -EACCES Trying to write a read-only device 1061 */ 1062 int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) 1063 { 1064 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); 1065 1066 if (bytes < 0) { 1067 return -EINVAL; 1068 } 1069 1070 return bdrv_pwritev(child, offset, &qiov); 1071 } 1072 1073 /* 1074 * Writes to the file and ensures that no writes are reordered across this 1075 * request (acts as a barrier) 1076 * 1077 * Returns 0 on success, -errno in error cases. 1078 */ 1079 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, 1080 const void *buf, int count) 1081 { 1082 int ret; 1083 1084 ret = bdrv_pwrite(child, offset, buf, count); 1085 if (ret < 0) { 1086 return ret; 1087 } 1088 1089 ret = bdrv_flush(child->bs); 1090 if (ret < 0) { 1091 return ret; 1092 } 1093 1094 return 0; 1095 } 1096 1097 typedef struct CoroutineIOCompletion { 1098 Coroutine *coroutine; 1099 int ret; 1100 } CoroutineIOCompletion; 1101 1102 static void bdrv_co_io_em_complete(void *opaque, int ret) 1103 { 1104 CoroutineIOCompletion *co = opaque; 1105 1106 co->ret = ret; 1107 aio_co_wake(co->coroutine); 1108 } 1109 1110 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, 1111 uint64_t offset, uint64_t bytes, 1112 QEMUIOVector *qiov, 1113 size_t qiov_offset, int flags) 1114 { 1115 BlockDriver *drv = bs->drv; 1116 int64_t sector_num; 1117 unsigned int nb_sectors; 1118 QEMUIOVector local_qiov; 1119 int ret; 1120 1121 assert(!(flags & ~BDRV_REQ_MASK)); 1122 assert(!(flags & BDRV_REQ_NO_FALLBACK)); 1123 1124 if (!drv) { 1125 return -ENOMEDIUM; 1126 } 1127 1128 if (drv->bdrv_co_preadv_part) { 1129 return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset, 1130 flags); 1131 } 1132 1133 if (qiov_offset > 0 || bytes != qiov->size) { 1134 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1135 qiov = &local_qiov; 1136 } 1137 1138 if (drv->bdrv_co_preadv) { 1139 ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); 1140 goto out; 1141 } 1142 1143 if (drv->bdrv_aio_preadv) { 1144 BlockAIOCB *acb; 1145 CoroutineIOCompletion co = { 1146 .coroutine = qemu_coroutine_self(), 1147 }; 1148 1149 acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags, 1150 bdrv_co_io_em_complete, &co); 1151 if (acb == NULL) { 1152 ret = -EIO; 1153 goto out; 1154 } else { 1155 qemu_coroutine_yield(); 1156 ret = co.ret; 1157 goto out; 1158 } 1159 } 1160 1161 sector_num = offset >> BDRV_SECTOR_BITS; 1162 nb_sectors = bytes >> 
BDRV_SECTOR_BITS; 1163 1164 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 1165 assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 1166 assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1167 assert(drv->bdrv_co_readv); 1168 1169 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 1170 1171 out: 1172 if (qiov == &local_qiov) { 1173 qemu_iovec_destroy(&local_qiov); 1174 } 1175 1176 return ret; 1177 } 1178 1179 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, 1180 uint64_t offset, uint64_t bytes, 1181 QEMUIOVector *qiov, 1182 size_t qiov_offset, int flags) 1183 { 1184 BlockDriver *drv = bs->drv; 1185 int64_t sector_num; 1186 unsigned int nb_sectors; 1187 QEMUIOVector local_qiov; 1188 int ret; 1189 1190 assert(!(flags & ~BDRV_REQ_MASK)); 1191 assert(!(flags & BDRV_REQ_NO_FALLBACK)); 1192 1193 if (!drv) { 1194 return -ENOMEDIUM; 1195 } 1196 1197 if (drv->bdrv_co_pwritev_part) { 1198 ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 1199 flags & bs->supported_write_flags); 1200 flags &= ~bs->supported_write_flags; 1201 goto emulate_flags; 1202 } 1203 1204 if (qiov_offset > 0 || bytes != qiov->size) { 1205 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1206 qiov = &local_qiov; 1207 } 1208 1209 if (drv->bdrv_co_pwritev) { 1210 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, 1211 flags & bs->supported_write_flags); 1212 flags &= ~bs->supported_write_flags; 1213 goto emulate_flags; 1214 } 1215 1216 if (drv->bdrv_aio_pwritev) { 1217 BlockAIOCB *acb; 1218 CoroutineIOCompletion co = { 1219 .coroutine = qemu_coroutine_self(), 1220 }; 1221 1222 acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, 1223 flags & bs->supported_write_flags, 1224 bdrv_co_io_em_complete, &co); 1225 flags &= ~bs->supported_write_flags; 1226 if (acb == NULL) { 1227 ret = -EIO; 1228 } else { 1229 qemu_coroutine_yield(); 1230 ret = co.ret; 1231 } 1232 goto emulate_flags; 1233 } 1234 1235 sector_num = offset >> BDRV_SECTOR_BITS; 1236 nb_sectors = bytes >> BDRV_SECTOR_BITS; 1237 1238 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 1239 assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 1240 assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1241 1242 assert(drv->bdrv_co_writev); 1243 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, 1244 flags & bs->supported_write_flags); 1245 flags &= ~bs->supported_write_flags; 1246 1247 emulate_flags: 1248 if (ret == 0 && (flags & BDRV_REQ_FUA)) { 1249 ret = bdrv_co_flush(bs); 1250 } 1251 1252 if (qiov == &local_qiov) { 1253 qemu_iovec_destroy(&local_qiov); 1254 } 1255 1256 return ret; 1257 } 1258 1259 static int coroutine_fn 1260 bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset, 1261 uint64_t bytes, QEMUIOVector *qiov, 1262 size_t qiov_offset) 1263 { 1264 BlockDriver *drv = bs->drv; 1265 QEMUIOVector local_qiov; 1266 int ret; 1267 1268 if (!drv) { 1269 return -ENOMEDIUM; 1270 } 1271 1272 if (!block_driver_can_compress(drv)) { 1273 return -ENOTSUP; 1274 } 1275 1276 if (drv->bdrv_co_pwritev_compressed_part) { 1277 return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes, 1278 qiov, qiov_offset); 1279 } 1280 1281 if (qiov_offset == 0) { 1282 return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov); 1283 } 1284 1285 qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1286 ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov); 1287 qemu_iovec_destroy(&local_qiov); 1288 1289 return ret; 1290 } 1291 1292 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 1293 int64_t 
offset, unsigned int bytes, QEMUIOVector *qiov, 1294 size_t qiov_offset, int flags) 1295 { 1296 BlockDriverState *bs = child->bs; 1297 1298 /* Perform I/O through a temporary buffer so that users who scribble over 1299 * their read buffer while the operation is in progress do not end up 1300 * modifying the image file. This is critical for zero-copy guest I/O 1301 * where anything might happen inside guest memory. 1302 */ 1303 void *bounce_buffer = NULL; 1304 1305 BlockDriver *drv = bs->drv; 1306 int64_t cluster_offset; 1307 int64_t cluster_bytes; 1308 size_t skip_bytes; 1309 int ret; 1310 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1311 BDRV_REQUEST_MAX_BYTES); 1312 unsigned int progress = 0; 1313 bool skip_write; 1314 1315 if (!drv) { 1316 return -ENOMEDIUM; 1317 } 1318 1319 /* 1320 * Do not write anything when the BDS is inactive. That is not 1321 * allowed, and it would not help. 1322 */ 1323 skip_write = (bs->open_flags & BDRV_O_INACTIVE); 1324 1325 /* FIXME We cannot require callers to have write permissions when all they 1326 * are doing is a read request. If we did things right, write permissions 1327 * would be obtained anyway, but internally by the copy-on-read code. As 1328 * long as it is implemented here rather than in a separate filter driver, 1329 * the copy-on-read code doesn't have its own BdrvChild, however, for which 1330 * it could request permissions. Therefore we have to bypass the permission 1331 * system for the moment. */ 1332 // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1333 1334 /* Cover entire cluster so no additional backing file I/O is required when 1335 * allocating cluster in the image file. Note that this value may exceed 1336 * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which 1337 * is one reason we loop rather than doing it all at once. 1338 */ 1339 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1340 skip_bytes = offset - cluster_offset; 1341 1342 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1343 cluster_offset, cluster_bytes); 1344 1345 while (cluster_bytes) { 1346 int64_t pnum; 1347 1348 if (skip_write) { 1349 ret = 1; /* "already allocated", so nothing will be copied */ 1350 pnum = MIN(cluster_bytes, max_transfer); 1351 } else { 1352 ret = bdrv_is_allocated(bs, cluster_offset, 1353 MIN(cluster_bytes, max_transfer), &pnum); 1354 if (ret < 0) { 1355 /* 1356 * Safe to treat errors in querying allocation as if 1357 * unallocated; we'll probably fail again soon on the 1358 * read, but at least that will set a decent errno. 
1359 */ 1360 pnum = MIN(cluster_bytes, max_transfer); 1361 } 1362 1363 /* Stop at EOF if the image ends in the middle of the cluster */ 1364 if (ret == 0 && pnum == 0) { 1365 assert(progress >= bytes); 1366 break; 1367 } 1368 1369 assert(skip_bytes < pnum); 1370 } 1371 1372 if (ret <= 0) { 1373 QEMUIOVector local_qiov; 1374 1375 /* Must copy-on-read; use the bounce buffer */ 1376 pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 1377 if (!bounce_buffer) { 1378 int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); 1379 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); 1380 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); 1381 1382 bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len); 1383 if (!bounce_buffer) { 1384 ret = -ENOMEM; 1385 goto err; 1386 } 1387 } 1388 qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); 1389 1390 ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1391 &local_qiov, 0, 0); 1392 if (ret < 0) { 1393 goto err; 1394 } 1395 1396 bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1397 if (drv->bdrv_co_pwrite_zeroes && 1398 buffer_is_zero(bounce_buffer, pnum)) { 1399 /* FIXME: Should we (perhaps conditionally) be setting 1400 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1401 * that still correctly reads as zero? */ 1402 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 1403 BDRV_REQ_WRITE_UNCHANGED); 1404 } else { 1405 /* This does not change the data on the disk, it is not 1406 * necessary to flush even in cache=writethrough mode. 1407 */ 1408 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 1409 &local_qiov, 0, 1410 BDRV_REQ_WRITE_UNCHANGED); 1411 } 1412 1413 if (ret < 0) { 1414 /* It might be okay to ignore write errors for guest 1415 * requests. If this is a deliberate copy-on-read 1416 * then we don't want to ignore the error. Simply 1417 * report it in all cases. 1418 */ 1419 goto err; 1420 } 1421 1422 if (!(flags & BDRV_REQ_PREFETCH)) { 1423 qemu_iovec_from_buf(qiov, qiov_offset + progress, 1424 bounce_buffer + skip_bytes, 1425 MIN(pnum - skip_bytes, bytes - progress)); 1426 } 1427 } else if (!(flags & BDRV_REQ_PREFETCH)) { 1428 /* Read directly into the destination */ 1429 ret = bdrv_driver_preadv(bs, offset + progress, 1430 MIN(pnum - skip_bytes, bytes - progress), 1431 qiov, qiov_offset + progress, 0); 1432 if (ret < 0) { 1433 goto err; 1434 } 1435 } 1436 1437 cluster_offset += pnum; 1438 cluster_bytes -= pnum; 1439 progress += pnum - skip_bytes; 1440 skip_bytes = 0; 1441 } 1442 ret = 0; 1443 1444 err: 1445 qemu_vfree(bounce_buffer); 1446 return ret; 1447 } 1448 1449 /* 1450 * Forwards an already correctly aligned request to the BlockDriver. This 1451 * handles copy on read, zeroing after EOF, and fragmentation of large 1452 * reads; any other features must be implemented by the caller. 
1453 */ 1454 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 1455 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1456 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 1457 { 1458 BlockDriverState *bs = child->bs; 1459 int64_t total_bytes, max_bytes; 1460 int ret = 0; 1461 uint64_t bytes_remaining = bytes; 1462 int max_transfer; 1463 1464 assert(is_power_of_2(align)); 1465 assert((offset & (align - 1)) == 0); 1466 assert((bytes & (align - 1)) == 0); 1467 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1468 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1469 align); 1470 1471 /* TODO: We would need a per-BDS .supported_read_flags and 1472 * potential fallback support, if we ever implement any read flags 1473 * to pass through to drivers. For now, there aren't any 1474 * passthrough flags. */ 1475 assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH))); 1476 1477 /* Handle Copy on Read and associated serialisation */ 1478 if (flags & BDRV_REQ_COPY_ON_READ) { 1479 /* If we touch the same cluster it counts as an overlap. This 1480 * guarantees that allocating writes will be serialized and not race 1481 * with each other for the same cluster. For example, in copy-on-read 1482 * it ensures that the CoR read and write operations are atomic and 1483 * guest writes cannot interleave between them. */ 1484 bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs)); 1485 } else { 1486 bdrv_wait_serialising_requests(req); 1487 } 1488 1489 if (flags & BDRV_REQ_COPY_ON_READ) { 1490 int64_t pnum; 1491 1492 ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 1493 if (ret < 0) { 1494 goto out; 1495 } 1496 1497 if (!ret || pnum != bytes) { 1498 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, 1499 qiov, qiov_offset, flags); 1500 goto out; 1501 } else if (flags & BDRV_REQ_PREFETCH) { 1502 goto out; 1503 } 1504 } 1505 1506 /* Forward the request to the BlockDriver, possibly fragmenting it */ 1507 total_bytes = bdrv_getlength(bs); 1508 if (total_bytes < 0) { 1509 ret = total_bytes; 1510 goto out; 1511 } 1512 1513 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 1514 if (bytes <= max_bytes && bytes <= max_transfer) { 1515 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); 1516 goto out; 1517 } 1518 1519 while (bytes_remaining) { 1520 int num; 1521 1522 if (max_bytes) { 1523 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 1524 assert(num); 1525 1526 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1527 num, qiov, 1528 qiov_offset + bytes - bytes_remaining, 0); 1529 max_bytes -= num; 1530 } else { 1531 num = bytes_remaining; 1532 ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining, 1533 0, bytes_remaining); 1534 } 1535 if (ret < 0) { 1536 goto out; 1537 } 1538 bytes_remaining -= num; 1539 } 1540 1541 out: 1542 return ret < 0 ? ret : 0; 1543 } 1544 1545 /* 1546 * Request padding 1547 * 1548 * |<---- align ----->| |<----- align ---->| 1549 * |<- head ->|<------------- bytes ------------->|<-- tail -->| 1550 * | | | | | | 1551 * -*----------$-------*-------- ... --------*-----$------------*--- 1552 * | | | | | | 1553 * | offset | | end | 1554 * ALIGN_DOWN(offset) ALIGN_UP(offset) ALIGN_DOWN(end) ALIGN_UP(end) 1555 * [buf ... ) [tail_buf ) 1556 * 1557 * @buf is an aligned allocation needed to store @head and @tail paddings. @head 1558 * is placed at the beginning of @buf and @tail at the @end. 
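 *
 * Worked example (illustrative numbers only): with request_alignment == 512,
 * offset == 1000 and bytes == 3000 give head == 488 and tail == 96; the
 * padded request then covers [512, 4096). head + bytes + tail == 3584, which
 * exceeds one alignment unit while both paddings exist, so buf_len == 1024
 * and merge_reads stays false.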
1559 * 1560 * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk 1561 * around tail, if tail exists. 1562 * 1563 * @merge_reads is true for small requests, 1564 * if @buf_len == @head + bytes + @tail. In this case it is possible that both 1565 * head and tail exist but @buf_len == align and @tail_buf == @buf. 1566 */ 1567 typedef struct BdrvRequestPadding { 1568 uint8_t *buf; 1569 size_t buf_len; 1570 uint8_t *tail_buf; 1571 size_t head; 1572 size_t tail; 1573 bool merge_reads; 1574 QEMUIOVector local_qiov; 1575 } BdrvRequestPadding; 1576 1577 static bool bdrv_init_padding(BlockDriverState *bs, 1578 int64_t offset, int64_t bytes, 1579 BdrvRequestPadding *pad) 1580 { 1581 uint64_t align = bs->bl.request_alignment; 1582 size_t sum; 1583 1584 memset(pad, 0, sizeof(*pad)); 1585 1586 pad->head = offset & (align - 1); 1587 pad->tail = ((offset + bytes) & (align - 1)); 1588 if (pad->tail) { 1589 pad->tail = align - pad->tail; 1590 } 1591 1592 if (!pad->head && !pad->tail) { 1593 return false; 1594 } 1595 1596 assert(bytes); /* Nothing good in aligning zero-length requests */ 1597 1598 sum = pad->head + bytes + pad->tail; 1599 pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align; 1600 pad->buf = qemu_blockalign(bs, pad->buf_len); 1601 pad->merge_reads = sum == pad->buf_len; 1602 if (pad->tail) { 1603 pad->tail_buf = pad->buf + pad->buf_len - align; 1604 } 1605 1606 return true; 1607 } 1608 1609 static int bdrv_padding_rmw_read(BdrvChild *child, 1610 BdrvTrackedRequest *req, 1611 BdrvRequestPadding *pad, 1612 bool zero_middle) 1613 { 1614 QEMUIOVector local_qiov; 1615 BlockDriverState *bs = child->bs; 1616 uint64_t align = bs->bl.request_alignment; 1617 int ret; 1618 1619 assert(req->serialising && pad->buf); 1620 1621 if (pad->head || pad->merge_reads) { 1622 uint64_t bytes = pad->merge_reads ? pad->buf_len : align; 1623 1624 qemu_iovec_init_buf(&local_qiov, pad->buf, bytes); 1625 1626 if (pad->head) { 1627 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1628 } 1629 if (pad->merge_reads && pad->tail) { 1630 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1631 } 1632 ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes, 1633 align, &local_qiov, 0, 0); 1634 if (ret < 0) { 1635 return ret; 1636 } 1637 if (pad->head) { 1638 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1639 } 1640 if (pad->merge_reads && pad->tail) { 1641 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1642 } 1643 1644 if (pad->merge_reads) { 1645 goto zero_mem; 1646 } 1647 } 1648 1649 if (pad->tail) { 1650 qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align); 1651 1652 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1653 ret = bdrv_aligned_preadv( 1654 child, req, 1655 req->overlap_offset + req->overlap_bytes - align, 1656 align, align, &local_qiov, 0, 0); 1657 if (ret < 0) { 1658 return ret; 1659 } 1660 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1661 } 1662 1663 zero_mem: 1664 if (zero_middle) { 1665 memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail); 1666 } 1667 1668 return 0; 1669 } 1670 1671 static void bdrv_padding_destroy(BdrvRequestPadding *pad) 1672 { 1673 if (pad->buf) { 1674 qemu_vfree(pad->buf); 1675 qemu_iovec_destroy(&pad->local_qiov); 1676 } 1677 } 1678 1679 /* 1680 * bdrv_pad_request 1681 * 1682 * Exchange request parameters with padded request if needed. Don't include RMW 1683 * read of padding, bdrv_padding_rmw_read() should be called separately if 1684 * needed. 
 *
 * All parameters except @bs are in-out: they represent the original request
 * when the function is called and the padded request (if padding was needed)
 * when it returns.
 *
 * Function always succeeds.
 */
static bool bdrv_pad_request(BlockDriverState *bs,
                             QEMUIOVector **qiov, size_t *qiov_offset,
                             int64_t *offset, unsigned int *bytes,
                             BdrvRequestPadding *pad)
{
    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
        return false;
    }

    qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
                             *qiov, *qiov_offset, *bytes,
                             pad->buf + pad->buf_len - pad->tail, pad->tail);
    *bytes += pad->head + pad->tail;
    *offset -= pad->head;
    *qiov = &pad->local_qiov;
    *qiov_offset = 0;

    return true;
}

int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
}

int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
    int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    BdrvRequestPadding pad;
    int ret;

    trace_bdrv_co_preadv(bs, offset, bytes, flags);

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning a zero-length request makes no sense. Even if the driver
         * gives zero-length requests a special meaning (as
         * qcow2_co_pwritev_compressed_part does), we cannot pass the request
         * through because of request_alignment.
         *
         * Still, there is no reason to return an error for the occasional
         * unaligned zero-length read.
         */
        return 0;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
                              bs->bl.request_alignment,
                              qiov, qiov_offset, flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    bdrv_padding_destroy(&pad);

    return ret;
}
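
/*
 * Illustrative sketch (not part of the build): a coroutine caller issuing a
 * byte-granularity read through the function above. The helper name and its
 * arguments are hypothetical; QEMU_IOVEC_INIT_BUF() and bdrv_co_preadv() are
 * the real interfaces used elsewhere in this file.
 *
 *     static int coroutine_fn my_read_some(BdrvChild *child, int64_t offset,
 *                                          void *buf, unsigned int bytes)
 *     {
 *         QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
 *
 *         // Padding to request_alignment and fragmentation to max_transfer
 *         // happen internally (bdrv_pad_request()/bdrv_aligned_preadv()).
 *         return bdrv_co_preadv(child, offset, bytes, &qiov, 0);
 *     }
 */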

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    void *buf = NULL;
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
        return -ENOTSUP;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request. Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes. */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector. */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwritev() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            if (buf == NULL) {
                buf = qemu_try_blockalign0(bs, num);
                if (buf == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }
            qemu_iovec_init_buf(&qiov, buf, num);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(buf);
                buf = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(buf);
    return ret;
}
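
/*
 * Worked example for the fragmentation above (illustrative numbers only,
 * assuming alignment == 4096 and generous max_transfer/max_write_zeroes
 * limits): a request with offset == 4000 and bytes == 10000 is split into
 * three driver calls:
 *
 *     1) 96 bytes at offset 4000     (unaligned head, up to the boundary)
 *     2) 8192 bytes at offset 4096   (aligned bulk, tail of 1712 trimmed off)
 *     3) 1712 bytes at offset 12288  (unaligned tail)
 *
 * Only the middle call is guaranteed to be aligned; the head and tail pieces
 * may take the bounce-buffer fallback if the driver rejects them.
 */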

static inline int coroutine_fn
bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
                          BdrvTrackedRequest *req, int flags)
{
    BlockDriverState *bs = child->bs;
    bool waited;
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);

    if (bs->read_only) {
        return -EPERM;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));

    if (flags & BDRV_REQ_SERIALISING) {
        waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
        /*
         * For a misaligned request we should have already waited earlier,
         * because we come after bdrv_padding_rmw_read which must be called
         * with the request already marked as serialising.
1901 */ 1902 assert(!waited || 1903 (req->offset == req->overlap_offset && 1904 req->bytes == req->overlap_bytes)); 1905 } else { 1906 bdrv_wait_serialising_requests(req); 1907 } 1908 1909 assert(req->overlap_offset <= offset); 1910 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1911 assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); 1912 1913 switch (req->type) { 1914 case BDRV_TRACKED_WRITE: 1915 case BDRV_TRACKED_DISCARD: 1916 if (flags & BDRV_REQ_WRITE_UNCHANGED) { 1917 assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1918 } else { 1919 assert(child->perm & BLK_PERM_WRITE); 1920 } 1921 return notifier_with_return_list_notify(&bs->before_write_notifiers, 1922 req); 1923 case BDRV_TRACKED_TRUNCATE: 1924 assert(child->perm & BLK_PERM_RESIZE); 1925 return 0; 1926 default: 1927 abort(); 1928 } 1929 } 1930 1931 static inline void coroutine_fn 1932 bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, 1933 BdrvTrackedRequest *req, int ret) 1934 { 1935 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 1936 BlockDriverState *bs = child->bs; 1937 1938 atomic_inc(&bs->write_gen); 1939 1940 /* 1941 * Discard cannot extend the image, but in error handling cases, such as 1942 * when reverting a qcow2 cluster allocation, the discarded range can pass 1943 * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD 1944 * here. Instead, just skip it, since semantically a discard request 1945 * beyond EOF cannot expand the image anyway. 1946 */ 1947 if (ret == 0 && 1948 (req->type == BDRV_TRACKED_TRUNCATE || 1949 end_sector > bs->total_sectors) && 1950 req->type != BDRV_TRACKED_DISCARD) { 1951 bs->total_sectors = end_sector; 1952 bdrv_parent_cb_resize(bs); 1953 bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); 1954 } 1955 if (req->bytes) { 1956 switch (req->type) { 1957 case BDRV_TRACKED_WRITE: 1958 stat64_max(&bs->wr_highest_offset, offset + bytes); 1959 /* fall through, to set dirty bits */ 1960 case BDRV_TRACKED_DISCARD: 1961 bdrv_set_dirty(bs, offset, bytes); 1962 break; 1963 default: 1964 break; 1965 } 1966 } 1967 } 1968 1969 /* 1970 * Forwards an already correctly aligned write request to the BlockDriver, 1971 * after possibly fragmenting it. 
1972 */ 1973 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1974 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1975 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 1976 { 1977 BlockDriverState *bs = child->bs; 1978 BlockDriver *drv = bs->drv; 1979 int ret; 1980 1981 uint64_t bytes_remaining = bytes; 1982 int max_transfer; 1983 1984 if (!drv) { 1985 return -ENOMEDIUM; 1986 } 1987 1988 if (bdrv_has_readonly_bitmaps(bs)) { 1989 return -EPERM; 1990 } 1991 1992 assert(is_power_of_2(align)); 1993 assert((offset & (align - 1)) == 0); 1994 assert((bytes & (align - 1)) == 0); 1995 assert(!qiov || qiov_offset + bytes <= qiov->size); 1996 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1997 align); 1998 1999 ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 2000 2001 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 2002 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 2003 qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 2004 flags |= BDRV_REQ_ZERO_WRITE; 2005 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 2006 flags |= BDRV_REQ_MAY_UNMAP; 2007 } 2008 } 2009 2010 if (ret < 0) { 2011 /* Do nothing, write notifier decided to fail this request */ 2012 } else if (flags & BDRV_REQ_ZERO_WRITE) { 2013 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 2014 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 2015 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 2016 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 2017 qiov, qiov_offset); 2018 } else if (bytes <= max_transfer) { 2019 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2020 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 2021 } else { 2022 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2023 while (bytes_remaining) { 2024 int num = MIN(bytes_remaining, max_transfer); 2025 int local_flags = flags; 2026 2027 assert(num); 2028 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 2029 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 2030 /* If FUA is going to be emulated by flush, we only 2031 * need to flush on the last iteration */ 2032 local_flags &= ~BDRV_REQ_FUA; 2033 } 2034 2035 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 2036 num, qiov, 2037 qiov_offset + bytes - bytes_remaining, 2038 local_flags); 2039 if (ret < 0) { 2040 break; 2041 } 2042 bytes_remaining -= num; 2043 } 2044 } 2045 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 2046 2047 if (ret >= 0) { 2048 ret = 0; 2049 } 2050 bdrv_co_write_req_finish(child, offset, bytes, req, ret); 2051 2052 return ret; 2053 } 2054 2055 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 2056 int64_t offset, 2057 unsigned int bytes, 2058 BdrvRequestFlags flags, 2059 BdrvTrackedRequest *req) 2060 { 2061 BlockDriverState *bs = child->bs; 2062 QEMUIOVector local_qiov; 2063 uint64_t align = bs->bl.request_alignment; 2064 int ret = 0; 2065 bool padding; 2066 BdrvRequestPadding pad; 2067 2068 padding = bdrv_init_padding(bs, offset, bytes, &pad); 2069 if (padding) { 2070 bdrv_mark_request_serialising(req, align); 2071 2072 bdrv_padding_rmw_read(child, req, &pad, true); 2073 2074 if (pad.head || pad.merge_reads) { 2075 int64_t aligned_offset = offset & ~(align - 1); 2076 int64_t write_bytes = pad.merge_reads ? 
pad.buf_len : align; 2077 2078 qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 2079 ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 2080 align, &local_qiov, 0, 2081 flags & ~BDRV_REQ_ZERO_WRITE); 2082 if (ret < 0 || pad.merge_reads) { 2083 /* Error or all work is done */ 2084 goto out; 2085 } 2086 offset += write_bytes - pad.head; 2087 bytes -= write_bytes - pad.head; 2088 } 2089 } 2090 2091 assert(!bytes || (offset & (align - 1)) == 0); 2092 if (bytes >= align) { 2093 /* Write the aligned part in the middle. */ 2094 uint64_t aligned_bytes = bytes & ~(align - 1); 2095 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 2096 NULL, 0, flags); 2097 if (ret < 0) { 2098 goto out; 2099 } 2100 bytes -= aligned_bytes; 2101 offset += aligned_bytes; 2102 } 2103 2104 assert(!bytes || (offset & (align - 1)) == 0); 2105 if (bytes) { 2106 assert(align == pad.tail + bytes); 2107 2108 qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align); 2109 ret = bdrv_aligned_pwritev(child, req, offset, align, align, 2110 &local_qiov, 0, 2111 flags & ~BDRV_REQ_ZERO_WRITE); 2112 } 2113 2114 out: 2115 bdrv_padding_destroy(&pad); 2116 2117 return ret; 2118 } 2119 2120 /* 2121 * Handle a write request in coroutine context 2122 */ 2123 int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 2124 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 2125 BdrvRequestFlags flags) 2126 { 2127 return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); 2128 } 2129 2130 int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, 2131 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, 2132 BdrvRequestFlags flags) 2133 { 2134 BlockDriverState *bs = child->bs; 2135 BdrvTrackedRequest req; 2136 uint64_t align = bs->bl.request_alignment; 2137 BdrvRequestPadding pad; 2138 int ret; 2139 2140 trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); 2141 2142 if (!bs->drv) { 2143 return -ENOMEDIUM; 2144 } 2145 2146 ret = bdrv_check_byte_request(bs, offset, bytes); 2147 if (ret < 0) { 2148 return ret; 2149 } 2150 2151 /* If the request is misaligned then we can't make it efficient */ 2152 if ((flags & BDRV_REQ_NO_FALLBACK) && 2153 !QEMU_IS_ALIGNED(offset | bytes, align)) 2154 { 2155 return -ENOTSUP; 2156 } 2157 2158 if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 2159 /* 2160 * Aligning a zero-length request is nonsense. Even if the driver has a 2161 * special meaning for zero length (like qcow2_co_pwritev_compressed_part), 2162 * we can't pass it to the driver due to request_alignment. 2163 * 2164 * Still, there is no reason to return an error if someone does an 2165 * unaligned zero-length write occasionally. 2166 */ 2167 return 0; 2168 } 2169 2170 bdrv_inc_in_flight(bs); 2171 /* 2172 * Align write if necessary by performing a read-modify-write cycle. 2173 * Pad qiov with the read parts and be sure to have a tracked request not 2174 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
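 *
 * For instance (illustrative numbers only), with a request_alignment of
 * 4096 a 512-byte write at offset 5120 is widened to the aligned range
 * [4096, 8192): the 1024 bytes in front of the caller's data and the
 * 2560 bytes behind it are read into the padding buffers first, and a
 * single aligned 4096-byte write is then issued.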
2175 */ 2176 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 2177 2178 if (flags & BDRV_REQ_ZERO_WRITE) { 2179 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 2180 goto out; 2181 } 2182 2183 if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) { 2184 bdrv_mark_request_serialising(&req, align); 2185 bdrv_padding_rmw_read(child, &req, &pad, false); 2186 } 2187 2188 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 2189 qiov, qiov_offset, flags); 2190 2191 bdrv_padding_destroy(&pad); 2192 2193 out: 2194 tracked_request_end(&req); 2195 bdrv_dec_in_flight(bs); 2196 2197 return ret; 2198 } 2199 2200 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2201 int bytes, BdrvRequestFlags flags) 2202 { 2203 trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 2204 2205 if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 2206 flags &= ~BDRV_REQ_MAY_UNMAP; 2207 } 2208 2209 return bdrv_co_pwritev(child, offset, bytes, NULL, 2210 BDRV_REQ_ZERO_WRITE | flags); 2211 } 2212 2213 /* 2214 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not. 2215 */ 2216 int bdrv_flush_all(void) 2217 { 2218 BdrvNextIterator it; 2219 BlockDriverState *bs = NULL; 2220 int result = 0; 2221 2222 /* 2223 * bdrv queue is managed by record/replay, 2224 * creating new flush request for stopping 2225 * the VM may break the determinism 2226 */ 2227 if (replay_events_enabled()) { 2228 return result; 2229 } 2230 2231 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 2232 AioContext *aio_context = bdrv_get_aio_context(bs); 2233 int ret; 2234 2235 aio_context_acquire(aio_context); 2236 ret = bdrv_flush(bs); 2237 if (ret < 0 && !result) { 2238 result = ret; 2239 } 2240 aio_context_release(aio_context); 2241 } 2242 2243 return result; 2244 } 2245 2246 2247 typedef struct BdrvCoBlockStatusData { 2248 BlockDriverState *bs; 2249 BlockDriverState *base; 2250 bool want_zero; 2251 int64_t offset; 2252 int64_t bytes; 2253 int64_t *pnum; 2254 int64_t *map; 2255 BlockDriverState **file; 2256 } BdrvCoBlockStatusData; 2257 2258 int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, 2259 bool want_zero, 2260 int64_t offset, 2261 int64_t bytes, 2262 int64_t *pnum, 2263 int64_t *map, 2264 BlockDriverState **file) 2265 { 2266 assert(bs->file && bs->file->bs); 2267 *pnum = bytes; 2268 *map = offset; 2269 *file = bs->file->bs; 2270 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2271 } 2272 2273 int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, 2274 bool want_zero, 2275 int64_t offset, 2276 int64_t bytes, 2277 int64_t *pnum, 2278 int64_t *map, 2279 BlockDriverState **file) 2280 { 2281 assert(bs->backing && bs->backing->bs); 2282 *pnum = bytes; 2283 *map = offset; 2284 *file = bs->backing->bs; 2285 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2286 } 2287 2288 /* 2289 * Returns the allocation status of the specified sectors. 2290 * Drivers not implementing the functionality are assumed to not support 2291 * backing files, hence all their sectors are reported as allocated. 2292 * 2293 * If 'want_zero' is true, the caller is querying for mapping 2294 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 2295 * _ZERO where possible; otherwise, the result favors larger 'pnum', 2296 * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2297 * 2298 * If 'offset' is beyond the end of the disk image the return value is 2299 * BDRV_BLOCK_EOF and 'pnum' is set to 0. 
2300 * 2301 * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2302 * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2303 * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 2304 * 2305 * 'pnum' is set to the number of bytes (including and immediately 2306 * following the specified offset) that are easily known to be in the 2307 * same allocated/unallocated state. Note that a second call starting 2308 * at the original offset plus returned pnum may have the same status. 2309 * The returned value is non-zero on success except at end-of-file. 2310 * 2311 * Returns negative errno on failure. Otherwise, if the 2312 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 2313 * set to the host mapping and BDS corresponding to the guest offset. 2314 */ 2315 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2316 bool want_zero, 2317 int64_t offset, int64_t bytes, 2318 int64_t *pnum, int64_t *map, 2319 BlockDriverState **file) 2320 { 2321 int64_t total_size; 2322 int64_t n; /* bytes */ 2323 int ret; 2324 int64_t local_map = 0; 2325 BlockDriverState *local_file = NULL; 2326 int64_t aligned_offset, aligned_bytes; 2327 uint32_t align; 2328 2329 assert(pnum); 2330 *pnum = 0; 2331 total_size = bdrv_getlength(bs); 2332 if (total_size < 0) { 2333 ret = total_size; 2334 goto early_out; 2335 } 2336 2337 if (offset >= total_size) { 2338 ret = BDRV_BLOCK_EOF; 2339 goto early_out; 2340 } 2341 if (!bytes) { 2342 ret = 0; 2343 goto early_out; 2344 } 2345 2346 n = total_size - offset; 2347 if (n < bytes) { 2348 bytes = n; 2349 } 2350 2351 /* Must be non-NULL or bdrv_getlength() would have failed */ 2352 assert(bs->drv); 2353 if (!bs->drv->bdrv_co_block_status) { 2354 *pnum = bytes; 2355 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 2356 if (offset + bytes == total_size) { 2357 ret |= BDRV_BLOCK_EOF; 2358 } 2359 if (bs->drv->protocol_name) { 2360 ret |= BDRV_BLOCK_OFFSET_VALID; 2361 local_map = offset; 2362 local_file = bs; 2363 } 2364 goto early_out; 2365 } 2366 2367 bdrv_inc_in_flight(bs); 2368 2369 /* Round out to request_alignment boundaries */ 2370 align = bs->bl.request_alignment; 2371 aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2372 aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2373 2374 ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 2375 aligned_bytes, pnum, &local_map, 2376 &local_file); 2377 if (ret < 0) { 2378 *pnum = 0; 2379 goto out; 2380 } 2381 2382 /* 2383 * The driver's result must be a non-zero multiple of request_alignment. 2384 * Clamp pnum and adjust map to original request. 
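 *
 * Worked example (illustrative numbers only): with align == 512, a query
 * for offset 1000 / bytes 100 is sent to the driver as aligned_offset 512
 * / aligned_bytes 1024.  If the driver reports *pnum == 1024, the code
 * below subtracts the 488 bytes of rounding, clamps the result to the
 * 100 bytes that were asked about, and shifts local_map by the same 488
 * bytes so that it again corresponds to offset 1000.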
2385 */ 2386 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2387 align > offset - aligned_offset); 2388 if (ret & BDRV_BLOCK_RECURSE) { 2389 assert(ret & BDRV_BLOCK_DATA); 2390 assert(ret & BDRV_BLOCK_OFFSET_VALID); 2391 assert(!(ret & BDRV_BLOCK_ZERO)); 2392 } 2393 2394 *pnum -= offset - aligned_offset; 2395 if (*pnum > bytes) { 2396 *pnum = bytes; 2397 } 2398 if (ret & BDRV_BLOCK_OFFSET_VALID) { 2399 local_map += offset - aligned_offset; 2400 } 2401 2402 if (ret & BDRV_BLOCK_RAW) { 2403 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2404 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2405 *pnum, pnum, &local_map, &local_file); 2406 goto out; 2407 } 2408 2409 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2410 ret |= BDRV_BLOCK_ALLOCATED; 2411 } else if (want_zero && bs->drv->supports_backing) { 2412 if (bs->backing) { 2413 BlockDriverState *bs2 = bs->backing->bs; 2414 int64_t size2 = bdrv_getlength(bs2); 2415 2416 if (size2 >= 0 && offset >= size2) { 2417 ret |= BDRV_BLOCK_ZERO; 2418 } 2419 } else { 2420 ret |= BDRV_BLOCK_ZERO; 2421 } 2422 } 2423 2424 if (want_zero && ret & BDRV_BLOCK_RECURSE && 2425 local_file && local_file != bs && 2426 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2427 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2428 int64_t file_pnum; 2429 int ret2; 2430 2431 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2432 *pnum, &file_pnum, NULL, NULL); 2433 if (ret2 >= 0) { 2434 /* Ignore errors. This is just providing extra information, it 2435 * is useful but not necessary. 2436 */ 2437 if (ret2 & BDRV_BLOCK_EOF && 2438 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2439 /* 2440 * It is valid for the format block driver to read 2441 * beyond the end of the underlying file's current 2442 * size; such areas read as zero. 2443 */ 2444 ret |= BDRV_BLOCK_ZERO; 2445 } else { 2446 /* Limit request to the range reported by the protocol driver */ 2447 *pnum = file_pnum; 2448 ret |= (ret2 & BDRV_BLOCK_ZERO); 2449 } 2450 } 2451 } 2452 2453 out: 2454 bdrv_dec_in_flight(bs); 2455 if (ret >= 0 && offset + *pnum == total_size) { 2456 ret |= BDRV_BLOCK_EOF; 2457 } 2458 early_out: 2459 if (file) { 2460 *file = local_file; 2461 } 2462 if (map) { 2463 *map = local_map; 2464 } 2465 return ret; 2466 } 2467 2468 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2469 BlockDriverState *base, 2470 bool want_zero, 2471 int64_t offset, 2472 int64_t bytes, 2473 int64_t *pnum, 2474 int64_t *map, 2475 BlockDriverState **file) 2476 { 2477 BlockDriverState *p; 2478 int ret = 0; 2479 bool first = true; 2480 2481 assert(bs != base); 2482 for (p = bs; p != base; p = backing_bs(p)) { 2483 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2484 file); 2485 if (ret < 0) { 2486 break; 2487 } 2488 if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2489 /* 2490 * Reading beyond the end of the file continues to read 2491 * zeroes, but we can only widen the result to the 2492 * unallocated length we learned from an earlier 2493 * iteration. 2494 */ 2495 *pnum = bytes; 2496 } 2497 if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2498 break; 2499 } 2500 /* [offset, pnum] unallocated on this layer, which could be only 2501 * the first part of [offset, bytes]. 
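 * For example (illustrative numbers), if the caller asked about 1 MiB
 * but this layer reports only its first 64 KiB as unallocated, the next
 * layer down is queried for those 64 KiB only.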
*/ 2502 bytes = MIN(bytes, *pnum); 2503 first = false; 2504 } 2505 return ret; 2506 } 2507 2508 /* Coroutine wrapper for bdrv_block_status_above() */ 2509 static int coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 2510 { 2511 BdrvCoBlockStatusData *data = opaque; 2512 2513 return bdrv_co_block_status_above(data->bs, data->base, 2514 data->want_zero, 2515 data->offset, data->bytes, 2516 data->pnum, data->map, data->file); 2517 } 2518 2519 /* 2520 * Synchronous wrapper around bdrv_co_block_status_above(). 2521 * 2522 * See bdrv_co_block_status_above() for details. 2523 */ 2524 static int bdrv_common_block_status_above(BlockDriverState *bs, 2525 BlockDriverState *base, 2526 bool want_zero, int64_t offset, 2527 int64_t bytes, int64_t *pnum, 2528 int64_t *map, 2529 BlockDriverState **file) 2530 { 2531 BdrvCoBlockStatusData data = { 2532 .bs = bs, 2533 .base = base, 2534 .want_zero = want_zero, 2535 .offset = offset, 2536 .bytes = bytes, 2537 .pnum = pnum, 2538 .map = map, 2539 .file = file, 2540 }; 2541 2542 return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data); 2543 } 2544 2545 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 2546 int64_t offset, int64_t bytes, int64_t *pnum, 2547 int64_t *map, BlockDriverState **file) 2548 { 2549 return bdrv_common_block_status_above(bs, base, true, offset, bytes, 2550 pnum, map, file); 2551 } 2552 2553 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2554 int64_t *pnum, int64_t *map, BlockDriverState **file) 2555 { 2556 return bdrv_block_status_above(bs, backing_bs(bs), 2557 offset, bytes, pnum, map, file); 2558 } 2559 2560 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2561 int64_t bytes, int64_t *pnum) 2562 { 2563 int ret; 2564 int64_t dummy; 2565 2566 ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, 2567 bytes, pnum ? pnum : &dummy, NULL, 2568 NULL); 2569 if (ret < 0) { 2570 return ret; 2571 } 2572 return !!(ret & BDRV_BLOCK_ALLOCATED); 2573 } 2574 2575 /* 2576 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 2577 * 2578 * Return 1 if (a prefix of) the given range is allocated in any image 2579 * between BASE and TOP (BASE is only included if include_base is set). 2580 * BASE can be NULL to check if the given offset is allocated in any 2581 * image of the chain. Return 0 otherwise, or negative errno on 2582 * failure. 2583 * 2584 * 'pnum' is set to the number of bytes (including and immediately 2585 * following the specified offset) that are known to be in the same 2586 * allocated/unallocated state. Note that a subsequent call starting 2587 * at 'offset + *pnum' may return the same allocation status (in other 2588 * words, the result is not necessarily the maximum possible range); 2589 * but 'pnum' will only be 0 when end of file is reached. 
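 *
 * A typical caller (sketch only; 'top', 'base', 'offset' and 'bytes'
 * stand for whatever the caller is interested in, and error handling is
 * omitted) looks like:
 *
 *     int64_t pnum;
 *     int ret = bdrv_is_allocated_above(top, base, false, offset, bytes,
 *                                       &pnum);
 *     if (ret == 1) {
 *         the first pnum bytes are provided by [top, base)
 *     } else if (ret == 0) {
 *         the first pnum bytes must come from base or read as zeroes
 *     }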
2590 * 2591 */ 2592 int bdrv_is_allocated_above(BlockDriverState *top, 2593 BlockDriverState *base, 2594 bool include_base, int64_t offset, 2595 int64_t bytes, int64_t *pnum) 2596 { 2597 BlockDriverState *intermediate; 2598 int ret; 2599 int64_t n = bytes; 2600 2601 assert(base || !include_base); 2602 2603 intermediate = top; 2604 while (include_base || intermediate != base) { 2605 int64_t pnum_inter; 2606 int64_t size_inter; 2607 2608 assert(intermediate); 2609 ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 2610 if (ret < 0) { 2611 return ret; 2612 } 2613 if (ret) { 2614 *pnum = pnum_inter; 2615 return 1; 2616 } 2617 2618 size_inter = bdrv_getlength(intermediate); 2619 if (size_inter < 0) { 2620 return size_inter; 2621 } 2622 if (n > pnum_inter && 2623 (intermediate == top || offset + pnum_inter < size_inter)) { 2624 n = pnum_inter; 2625 } 2626 2627 if (intermediate == base) { 2628 break; 2629 } 2630 2631 intermediate = backing_bs(intermediate); 2632 } 2633 2634 *pnum = n; 2635 return 0; 2636 } 2637 2638 typedef struct BdrvVmstateCo { 2639 BlockDriverState *bs; 2640 QEMUIOVector *qiov; 2641 int64_t pos; 2642 bool is_read; 2643 } BdrvVmstateCo; 2644 2645 static int coroutine_fn 2646 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2647 bool is_read) 2648 { 2649 BlockDriver *drv = bs->drv; 2650 int ret = -ENOTSUP; 2651 2652 bdrv_inc_in_flight(bs); 2653 2654 if (!drv) { 2655 ret = -ENOMEDIUM; 2656 } else if (drv->bdrv_load_vmstate) { 2657 if (is_read) { 2658 ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2659 } else { 2660 ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2661 } 2662 } else if (bs->file) { 2663 ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); 2664 } 2665 2666 bdrv_dec_in_flight(bs); 2667 return ret; 2668 } 2669 2670 static int coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) 2671 { 2672 BdrvVmstateCo *co = opaque; 2673 2674 return bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); 2675 } 2676 2677 static inline int 2678 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2679 bool is_read) 2680 { 2681 BdrvVmstateCo data = { 2682 .bs = bs, 2683 .qiov = qiov, 2684 .pos = pos, 2685 .is_read = is_read, 2686 }; 2687 2688 return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data); 2689 } 2690 2691 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 2692 int64_t pos, int size) 2693 { 2694 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2695 int ret; 2696 2697 ret = bdrv_writev_vmstate(bs, &qiov, pos); 2698 if (ret < 0) { 2699 return ret; 2700 } 2701 2702 return size; 2703 } 2704 2705 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2706 { 2707 return bdrv_rw_vmstate(bs, qiov, pos, false); 2708 } 2709 2710 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 2711 int64_t pos, int size) 2712 { 2713 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2714 int ret; 2715 2716 ret = bdrv_readv_vmstate(bs, &qiov, pos); 2717 if (ret < 0) { 2718 return ret; 2719 } 2720 2721 return size; 2722 } 2723 2724 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2725 { 2726 return bdrv_rw_vmstate(bs, qiov, pos, true); 2727 } 2728 2729 /**************************************************************/ 2730 /* async I/Os */ 2731 2732 void bdrv_aio_cancel(BlockAIOCB *acb) 2733 { 2734 qemu_aio_ref(acb); 2735 bdrv_aio_cancel_async(acb); 2736 while (acb->refcnt > 1) { 2737 if (acb->aiocb_info->get_aio_context) { 2738 
aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2739 } else if (acb->bs) { 2740 /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 2741 * assert that we're not using an I/O thread. Thread-safe 2742 * code should use bdrv_aio_cancel_async exclusively. 2743 */ 2744 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 2745 aio_poll(bdrv_get_aio_context(acb->bs), true); 2746 } else { 2747 abort(); 2748 } 2749 } 2750 qemu_aio_unref(acb); 2751 } 2752 2753 /* Async version of aio cancel. The caller is not blocked if the acb implements 2754 * cancel_async, otherwise we do nothing and let the request normally complete. 2755 * In either case the completion callback must be called. */ 2756 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2757 { 2758 if (acb->aiocb_info->cancel_async) { 2759 acb->aiocb_info->cancel_async(acb); 2760 } 2761 } 2762 2763 /**************************************************************/ 2764 /* Coroutine block device emulation */ 2765 2766 static int coroutine_fn bdrv_flush_co_entry(void *opaque) 2767 { 2768 return bdrv_co_flush(opaque); 2769 } 2770 2771 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2772 { 2773 int current_gen; 2774 int ret = 0; 2775 2776 bdrv_inc_in_flight(bs); 2777 2778 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2779 bdrv_is_sg(bs)) { 2780 goto early_exit; 2781 } 2782 2783 qemu_co_mutex_lock(&bs->reqs_lock); 2784 current_gen = atomic_read(&bs->write_gen); 2785 2786 /* Wait until any previous flushes are completed */ 2787 while (bs->active_flush_req) { 2788 qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 2789 } 2790 2791 /* Flushes reach this point in nondecreasing current_gen order. */ 2792 bs->active_flush_req = true; 2793 qemu_co_mutex_unlock(&bs->reqs_lock); 2794 2795 /* Write back all layers by calling one driver function */ 2796 if (bs->drv->bdrv_co_flush) { 2797 ret = bs->drv->bdrv_co_flush(bs); 2798 goto out; 2799 } 2800 2801 /* Write back cached data to the OS even with cache=unsafe */ 2802 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2803 if (bs->drv->bdrv_co_flush_to_os) { 2804 ret = bs->drv->bdrv_co_flush_to_os(bs); 2805 if (ret < 0) { 2806 goto out; 2807 } 2808 } 2809 2810 /* But don't actually force it to the disk with cache=unsafe */ 2811 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2812 goto flush_parent; 2813 } 2814 2815 /* Check if we really need to flush anything */ 2816 if (bs->flushed_gen == current_gen) { 2817 goto flush_parent; 2818 } 2819 2820 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2821 if (!bs->drv) { 2822 /* bs->drv->bdrv_co_flush() might have ejected the BDS 2823 * (even in case of apparent success) */ 2824 ret = -ENOMEDIUM; 2825 goto out; 2826 } 2827 if (bs->drv->bdrv_co_flush_to_disk) { 2828 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2829 } else if (bs->drv->bdrv_aio_flush) { 2830 BlockAIOCB *acb; 2831 CoroutineIOCompletion co = { 2832 .coroutine = qemu_coroutine_self(), 2833 }; 2834 2835 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2836 if (acb == NULL) { 2837 ret = -EIO; 2838 } else { 2839 qemu_coroutine_yield(); 2840 ret = co.ret; 2841 } 2842 } else { 2843 /* 2844 * Some block drivers always operate in either writethrough or unsafe 2845 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2846 * know how the server works (because the behaviour is hardcoded or 2847 * depends on server-side configuration), so we can't ensure that 2848 * everything is safe on disk. 
Returning an error doesn't work because 2849 * that would break guests even if the server operates in writethrough 2850 * mode. 2851 * 2852 * Let's hope the user knows what he's doing. 2853 */ 2854 ret = 0; 2855 } 2856 2857 if (ret < 0) { 2858 goto out; 2859 } 2860 2861 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2862 * in the case of cache=unsafe, so there are no useless flushes. 2863 */ 2864 flush_parent: 2865 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; 2866 out: 2867 /* Notify any pending flushes that we have completed */ 2868 if (ret == 0) { 2869 bs->flushed_gen = current_gen; 2870 } 2871 2872 qemu_co_mutex_lock(&bs->reqs_lock); 2873 bs->active_flush_req = false; 2874 /* Return value is ignored - it's ok if wait queue is empty */ 2875 qemu_co_queue_next(&bs->flush_queue); 2876 qemu_co_mutex_unlock(&bs->reqs_lock); 2877 2878 early_exit: 2879 bdrv_dec_in_flight(bs); 2880 return ret; 2881 } 2882 2883 int bdrv_flush(BlockDriverState *bs) 2884 { 2885 return bdrv_run_co(bs, bdrv_flush_co_entry, bs); 2886 } 2887 2888 typedef struct DiscardCo { 2889 BdrvChild *child; 2890 int64_t offset; 2891 int64_t bytes; 2892 } DiscardCo; 2893 2894 static int coroutine_fn bdrv_pdiscard_co_entry(void *opaque) 2895 { 2896 DiscardCo *rwco = opaque; 2897 2898 return bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); 2899 } 2900 2901 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2902 int64_t bytes) 2903 { 2904 BdrvTrackedRequest req; 2905 int max_pdiscard, ret; 2906 int head, tail, align; 2907 BlockDriverState *bs = child->bs; 2908 2909 if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 2910 return -ENOMEDIUM; 2911 } 2912 2913 if (bdrv_has_readonly_bitmaps(bs)) { 2914 return -EPERM; 2915 } 2916 2917 if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) { 2918 return -EIO; 2919 } 2920 2921 /* Do nothing if disabled. */ 2922 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2923 return 0; 2924 } 2925 2926 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 2927 return 0; 2928 } 2929 2930 /* Discard is advisory, but some devices track and coalesce 2931 * unaligned requests, so we must pass everything down rather than 2932 * round here. Still, most devices will just silently ignore 2933 * unaligned requests (by returning -ENOTSUP), so we must fragment 2934 * the request accordingly. */ 2935 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2936 assert(align % bs->bl.request_alignment == 0); 2937 head = offset % align; 2938 tail = (offset + bytes) % align; 2939 2940 bdrv_inc_in_flight(bs); 2941 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 2942 2943 ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2944 if (ret < 0) { 2945 goto out; 2946 } 2947 2948 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 2949 align); 2950 assert(max_pdiscard >= bs->bl.request_alignment); 2951 2952 while (bytes > 0) { 2953 int64_t num = bytes; 2954 2955 if (head) { 2956 /* Make small requests to get to alignment boundaries. */ 2957 num = MIN(bytes, align - head); 2958 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 2959 num %= bs->bl.request_alignment; 2960 } 2961 head = (head + num) % align; 2962 assert(num < max_pdiscard); 2963 } else if (tail) { 2964 if (num > align) { 2965 /* Shorten the request to the last aligned cluster. 
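 *
 * Illustrative numbers: with align == 64 KiB (and a small
 * request_alignment), a discard of [1 KiB, 200 KiB) is issued as a
 * 63 KiB head fragment, a 128 KiB middle fragment that this branch has
 * shortened by the 8 KiB tail, and finally the 8 KiB tail itself.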
*/ 2966 num -= tail; 2967 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 2968 tail > bs->bl.request_alignment) { 2969 tail %= bs->bl.request_alignment; 2970 num -= tail; 2971 } 2972 } 2973 /* limit request size */ 2974 if (num > max_pdiscard) { 2975 num = max_pdiscard; 2976 } 2977 2978 if (!bs->drv) { 2979 ret = -ENOMEDIUM; 2980 goto out; 2981 } 2982 if (bs->drv->bdrv_co_pdiscard) { 2983 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 2984 } else { 2985 BlockAIOCB *acb; 2986 CoroutineIOCompletion co = { 2987 .coroutine = qemu_coroutine_self(), 2988 }; 2989 2990 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 2991 bdrv_co_io_em_complete, &co); 2992 if (acb == NULL) { 2993 ret = -EIO; 2994 goto out; 2995 } else { 2996 qemu_coroutine_yield(); 2997 ret = co.ret; 2998 } 2999 } 3000 if (ret && ret != -ENOTSUP) { 3001 goto out; 3002 } 3003 3004 offset += num; 3005 bytes -= num; 3006 } 3007 ret = 0; 3008 out: 3009 bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3010 tracked_request_end(&req); 3011 bdrv_dec_in_flight(bs); 3012 return ret; 3013 } 3014 3015 int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes) 3016 { 3017 DiscardCo rwco = { 3018 .child = child, 3019 .offset = offset, 3020 .bytes = bytes, 3021 }; 3022 3023 return bdrv_run_co(child->bs, bdrv_pdiscard_co_entry, &rwco); 3024 } 3025 3026 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 3027 { 3028 BlockDriver *drv = bs->drv; 3029 CoroutineIOCompletion co = { 3030 .coroutine = qemu_coroutine_self(), 3031 }; 3032 BlockAIOCB *acb; 3033 3034 bdrv_inc_in_flight(bs); 3035 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 3036 co.ret = -ENOTSUP; 3037 goto out; 3038 } 3039 3040 if (drv->bdrv_co_ioctl) { 3041 co.ret = drv->bdrv_co_ioctl(bs, req, buf); 3042 } else { 3043 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 3044 if (!acb) { 3045 co.ret = -ENOTSUP; 3046 goto out; 3047 } 3048 qemu_coroutine_yield(); 3049 } 3050 out: 3051 bdrv_dec_in_flight(bs); 3052 return co.ret; 3053 } 3054 3055 void *qemu_blockalign(BlockDriverState *bs, size_t size) 3056 { 3057 return qemu_memalign(bdrv_opt_mem_align(bs), size); 3058 } 3059 3060 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 3061 { 3062 return memset(qemu_blockalign(bs, size), 0, size); 3063 } 3064 3065 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 3066 { 3067 size_t align = bdrv_opt_mem_align(bs); 3068 3069 /* Ensure that NULL is never returned on success */ 3070 assert(align > 0); 3071 if (size == 0) { 3072 size = align; 3073 } 3074 3075 return qemu_try_memalign(align, size); 3076 } 3077 3078 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 3079 { 3080 void *mem = qemu_try_blockalign(bs, size); 3081 3082 if (mem) { 3083 memset(mem, 0, size); 3084 } 3085 3086 return mem; 3087 } 3088 3089 /* 3090 * Check if all memory in this vector is sector aligned. 
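 * For example, with bdrv_min_mem_align() returning 512, an entry such as
 * { .iov_base = buf + 256, .iov_len = 512 } (for a 512-byte-aligned buf,
 * illustrative values) makes the whole vector count as unaligned.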
3091 */ 3092 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 3093 { 3094 int i; 3095 size_t alignment = bdrv_min_mem_align(bs); 3096 3097 for (i = 0; i < qiov->niov; i++) { 3098 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 3099 return false; 3100 } 3101 if (qiov->iov[i].iov_len % alignment) { 3102 return false; 3103 } 3104 } 3105 3106 return true; 3107 } 3108 3109 void bdrv_add_before_write_notifier(BlockDriverState *bs, 3110 NotifierWithReturn *notifier) 3111 { 3112 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 3113 } 3114 3115 void bdrv_io_plug(BlockDriverState *bs) 3116 { 3117 BdrvChild *child; 3118 3119 QLIST_FOREACH(child, &bs->children, next) { 3120 bdrv_io_plug(child->bs); 3121 } 3122 3123 if (atomic_fetch_inc(&bs->io_plugged) == 0) { 3124 BlockDriver *drv = bs->drv; 3125 if (drv && drv->bdrv_io_plug) { 3126 drv->bdrv_io_plug(bs); 3127 } 3128 } 3129 } 3130 3131 void bdrv_io_unplug(BlockDriverState *bs) 3132 { 3133 BdrvChild *child; 3134 3135 assert(bs->io_plugged); 3136 if (atomic_fetch_dec(&bs->io_plugged) == 1) { 3137 BlockDriver *drv = bs->drv; 3138 if (drv && drv->bdrv_io_unplug) { 3139 drv->bdrv_io_unplug(bs); 3140 } 3141 } 3142 3143 QLIST_FOREACH(child, &bs->children, next) { 3144 bdrv_io_unplug(child->bs); 3145 } 3146 } 3147 3148 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 3149 { 3150 BdrvChild *child; 3151 3152 if (bs->drv && bs->drv->bdrv_register_buf) { 3153 bs->drv->bdrv_register_buf(bs, host, size); 3154 } 3155 QLIST_FOREACH(child, &bs->children, next) { 3156 bdrv_register_buf(child->bs, host, size); 3157 } 3158 } 3159 3160 void bdrv_unregister_buf(BlockDriverState *bs, void *host) 3161 { 3162 BdrvChild *child; 3163 3164 if (bs->drv && bs->drv->bdrv_unregister_buf) { 3165 bs->drv->bdrv_unregister_buf(bs, host); 3166 } 3167 QLIST_FOREACH(child, &bs->children, next) { 3168 bdrv_unregister_buf(child->bs, host); 3169 } 3170 } 3171 3172 static int coroutine_fn bdrv_co_copy_range_internal( 3173 BdrvChild *src, uint64_t src_offset, BdrvChild *dst, 3174 uint64_t dst_offset, uint64_t bytes, 3175 BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3176 bool recurse_src) 3177 { 3178 BdrvTrackedRequest req; 3179 int ret; 3180 3181 /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3182 assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3183 assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 3184 3185 if (!dst || !dst->bs) { 3186 return -ENOMEDIUM; 3187 } 3188 ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); 3189 if (ret) { 3190 return ret; 3191 } 3192 if (write_flags & BDRV_REQ_ZERO_WRITE) { 3193 return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3194 } 3195 3196 if (!src || !src->bs) { 3197 return -ENOMEDIUM; 3198 } 3199 ret = bdrv_check_byte_request(src->bs, src_offset, bytes); 3200 if (ret) { 3201 return ret; 3202 } 3203 3204 if (!src->bs->drv->bdrv_co_copy_range_from 3205 || !dst->bs->drv->bdrv_co_copy_range_to 3206 || src->bs->encrypted || dst->bs->encrypted) { 3207 return -ENOTSUP; 3208 } 3209 3210 if (recurse_src) { 3211 bdrv_inc_in_flight(src->bs); 3212 tracked_request_begin(&req, src->bs, src_offset, bytes, 3213 BDRV_TRACKED_READ); 3214 3215 /* BDRV_REQ_SERIALISING is only for write operation */ 3216 assert(!(read_flags & BDRV_REQ_SERIALISING)); 3217 bdrv_wait_serialising_requests(&req); 3218 3219 ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3220 src, src_offset, 3221 dst, dst_offset, 3222 bytes, 3223 read_flags, write_flags); 3224 3225 tracked_request_end(&req); 
3226 bdrv_dec_in_flight(src->bs); 3227 } else { 3228 bdrv_inc_in_flight(dst->bs); 3229 tracked_request_begin(&req, dst->bs, dst_offset, bytes, 3230 BDRV_TRACKED_WRITE); 3231 ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 3232 write_flags); 3233 if (!ret) { 3234 ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3235 src, src_offset, 3236 dst, dst_offset, 3237 bytes, 3238 read_flags, write_flags); 3239 } 3240 bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3241 tracked_request_end(&req); 3242 bdrv_dec_in_flight(dst->bs); 3243 } 3244 3245 return ret; 3246 } 3247 3248 /* Copy range from @src to @dst. 3249 * 3250 * See the comment of bdrv_co_copy_range for the parameter and return value 3251 * semantics. */ 3252 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, 3253 BdrvChild *dst, uint64_t dst_offset, 3254 uint64_t bytes, 3255 BdrvRequestFlags read_flags, 3256 BdrvRequestFlags write_flags) 3257 { 3258 trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3259 read_flags, write_flags); 3260 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3261 bytes, read_flags, write_flags, true); 3262 } 3263 3264 /* Copy range from @src to @dst. 3265 * 3266 * See the comment of bdrv_co_copy_range for the parameter and return value 3267 * semantics. */ 3268 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, 3269 BdrvChild *dst, uint64_t dst_offset, 3270 uint64_t bytes, 3271 BdrvRequestFlags read_flags, 3272 BdrvRequestFlags write_flags) 3273 { 3274 trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3275 read_flags, write_flags); 3276 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3277 bytes, read_flags, write_flags, false); 3278 } 3279 3280 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, 3281 BdrvChild *dst, uint64_t dst_offset, 3282 uint64_t bytes, BdrvRequestFlags read_flags, 3283 BdrvRequestFlags write_flags) 3284 { 3285 return bdrv_co_copy_range_from(src, src_offset, 3286 dst, dst_offset, 3287 bytes, read_flags, write_flags); 3288 } 3289 3290 static void bdrv_parent_cb_resize(BlockDriverState *bs) 3291 { 3292 BdrvChild *c; 3293 QLIST_FOREACH(c, &bs->parents, next_parent) { 3294 if (c->klass->resize) { 3295 c->klass->resize(c); 3296 } 3297 } 3298 } 3299 3300 /** 3301 * Truncate file to 'offset' bytes (needed only for file protocols) 3302 * 3303 * If 'exact' is true, the file must be resized to exactly the given 3304 * 'offset'. Otherwise, it is sufficient for the node to be at least 3305 * 'offset' bytes in length. 
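 *
 * A caller that merely wants to grow an image to at least new_size bytes
 * could use the synchronous wrapper defined below (sketch only; 'child',
 * 'new_size' and 'errp' are assumed to be provided by the caller):
 *
 *     ret = bdrv_truncate(child, new_size, false, PREALLOC_MODE_OFF, 0,
 *                         errp);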
3306 */ 3307 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, 3308 PreallocMode prealloc, BdrvRequestFlags flags, 3309 Error **errp) 3310 { 3311 BlockDriverState *bs = child->bs; 3312 BlockDriver *drv = bs->drv; 3313 BdrvTrackedRequest req; 3314 int64_t old_size, new_bytes; 3315 int ret; 3316 3317 3318 /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 3319 if (!drv) { 3320 error_setg(errp, "No medium inserted"); 3321 return -ENOMEDIUM; 3322 } 3323 if (offset < 0) { 3324 error_setg(errp, "Image size cannot be negative"); 3325 return -EINVAL; 3326 } 3327 3328 old_size = bdrv_getlength(bs); 3329 if (old_size < 0) { 3330 error_setg_errno(errp, -old_size, "Failed to get old image size"); 3331 return old_size; 3332 } 3333 3334 if (offset > old_size) { 3335 new_bytes = offset - old_size; 3336 } else { 3337 new_bytes = 0; 3338 } 3339 3340 bdrv_inc_in_flight(bs); 3341 tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, 3342 BDRV_TRACKED_TRUNCATE); 3343 3344 /* If we are growing the image and potentially using preallocation for the 3345 * new area, we need to make sure that no write requests are made to it 3346 * concurrently or they might be overwritten by preallocation. */ 3347 if (new_bytes) { 3348 bdrv_mark_request_serialising(&req, 1); 3349 } 3350 if (bs->read_only) { 3351 error_setg(errp, "Image is read-only"); 3352 ret = -EACCES; 3353 goto out; 3354 } 3355 ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, 3356 0); 3357 if (ret < 0) { 3358 error_setg_errno(errp, -ret, 3359 "Failed to prepare request for truncation"); 3360 goto out; 3361 } 3362 3363 /* 3364 * If the image has a backing file that is large enough that it would 3365 * provide data for the new area, we cannot leave it unallocated because 3366 * then the backing file content would become visible. Instead, zero-fill 3367 * the new area. 3368 * 3369 * Note that if the image has a backing file, but was opened without the 3370 * backing file, taking care of keeping things consistent with that backing 3371 * file is the user's responsibility. 3372 */ 3373 if (new_bytes && bs->backing) { 3374 int64_t backing_len; 3375 3376 backing_len = bdrv_getlength(backing_bs(bs)); 3377 if (backing_len < 0) { 3378 ret = backing_len; 3379 error_setg_errno(errp, -ret, "Could not get backing file size"); 3380 goto out; 3381 } 3382 3383 if (backing_len > old_size) { 3384 flags |= BDRV_REQ_ZERO_WRITE; 3385 } 3386 } 3387 3388 if (drv->bdrv_co_truncate) { 3389 if (flags & ~bs->supported_truncate_flags) { 3390 error_setg(errp, "Block driver does not support requested flags"); 3391 ret = -ENOTSUP; 3392 goto out; 3393 } 3394 ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); 3395 } else if (bs->file && drv->is_filter) { 3396 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); 3397 } else { 3398 error_setg(errp, "Image format driver does not support resize"); 3399 ret = -ENOTSUP; 3400 goto out; 3401 } 3402 if (ret < 0) { 3403 goto out; 3404 } 3405 3406 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3407 if (ret < 0) { 3408 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 3409 } else { 3410 offset = bs->total_sectors * BDRV_SECTOR_SIZE; 3411 } 3412 /* It's possible that truncation succeeded but refresh_total_sectors 3413 * failed, but the latter doesn't affect how we should finish the request. 3414 * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. 
*/ 3415 bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); 3416 3417 out: 3418 tracked_request_end(&req); 3419 bdrv_dec_in_flight(bs); 3420 3421 return ret; 3422 } 3423 3424 typedef struct TruncateCo { 3425 BdrvChild *child; 3426 int64_t offset; 3427 bool exact; 3428 PreallocMode prealloc; 3429 BdrvRequestFlags flags; 3430 Error **errp; 3431 } TruncateCo; 3432 3433 static int coroutine_fn bdrv_truncate_co_entry(void *opaque) 3434 { 3435 TruncateCo *tco = opaque; 3436 3437 return bdrv_co_truncate(tco->child, tco->offset, tco->exact, 3438 tco->prealloc, tco->flags, tco->errp); 3439 } 3440 3441 int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, 3442 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) 3443 { 3444 TruncateCo tco = { 3445 .child = child, 3446 .offset = offset, 3447 .exact = exact, 3448 .prealloc = prealloc, 3449 .flags = flags, 3450 .errp = errp, 3451 }; 3452 3453 return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco); 3454 } 3455
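
/*
 * Example of how the helpers in this file are typically combined by a
 * caller (a sketch, not part of the block layer itself; checks of the
 * image length and of individual return values are abbreviated): walk an
 * image with bdrv_block_status(), discard the ranges that read as
 * zeroes, and flush at the end.  'bs' and 'child' are assumed to refer
 * to the same open, writable node.
 *
 *     int64_t offset, total, pnum;
 *     int ret;
 *
 *     total = bdrv_getlength(bs);
 *     for (offset = 0; offset < total; offset += pnum) {
 *         ret = bdrv_block_status(bs, offset, total - offset, &pnum,
 *                                 NULL, NULL);
 *         if (ret < 0) {
 *             break;
 *         }
 *         if (ret & BDRV_BLOCK_ZERO) {
 *             bdrv_pdiscard(child, offset, pnum);
 *         }
 *     }
 *     bdrv_flush(bs);
 */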