/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                                      bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->klass->drained_end) {
        c->klass->drained_end(c, drained_end_counter);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                                    bool ignore_bds_parents,
                                    int *drained_end_counter)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    c->parent_quiesce_counter++;
    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size;

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
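
/*
 * Illustrative example (a worked case, not additional functionality):
 * bdrv_merge_limits() above treats 0 as "unlimited" for max_transfer and
 * max_iov and keeps the stricter alignment otherwise.  Merging a parent
 * with .max_transfer = 0 (unlimited) and .opt_mem_alignment = 4096 against
 * a child with .max_transfer = 65536 and .opt_mem_alignment = 512 yields
 * .max_transfer = 65536 (MIN_NON_ZERO) and .opt_mem_alignment = 4096 (MAX).
 */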

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
    int *drained_end_counter;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    atomic_mb_set(&data->done, true);
    if (!data->begin) {
        atomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        atomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here.
         * If it explicitly acquired a different context, the lock is still
         * held and we don't want to lock it a second time (or
         * AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs),
                                     bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true, NULL);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}
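
/*
 * Illustrative usage sketch (not an additional API): callers that need a
 * quiescent node, e.g. before modifying the graph, bracket the critical
 * section with the functions above and bdrv_drained_end():
 *
 *     bdrv_drained_begin(bs);
 *     ... no new requests reach bs, in-flight ones have completed ...
 *     bdrv_drained_end(bs);
 *
 * Drained sections nest; bs->quiesce_counter tracks how many are active.
 */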

/**
 * This function does not poll, nor must any of its recursively called
 * functions.  The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles.  Therefore, the pointer must remain valid
 * until the pointee reaches 0.  That implies that whoever sets up the
 * pointee has to poll until it is 0.
 *
 * We use atomic operations to access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}

void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int drained_end_counter = 0;
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false,
                            &drained_end_counter);
    }

    BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay; waiting for the in-flight
     * I/O requests to finish could block forever here.
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    int drained_end_counter = 0;

    /*
     * The bdrv queue is managed by record/replay; waiting for the in-flight
     * I/O requests to finish could block forever here.
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
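
/*
 * Illustrative example (a worked case of the check above): with a tracked
 * request whose overlap region is [4096, 8192) (overlap_offset = 4096,
 * overlap_bytes = 4096), tracked_request_overlaps() returns false for a
 * request covering [0, 4096) or [8192, 12288), and true for one covering
 * [6144, 10240), since the half-open intervals intersect.
 */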

static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
                                      BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);
    return waited;
}

bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    BlockDriverState *bs = req->bs;
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                             - overlap_offset;
    bool waited;

    qemu_co_mutex_lock(&bs->reqs_lock);
    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
    waited = bdrv_wait_serialising_requests_locked(bs, req);
    qemu_co_mutex_unlock(&bs->reqs_lock);
    return waited;
}

/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
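
/*
 * Illustrative example (a worked case of the rounding above): with a 64 KiB
 * cluster size, rounding the region offset = 70000, bytes = 1000 gives
 * cluster_offset = 65536 (QEMU_ALIGN_DOWN) and cluster_bytes = 65536,
 * because offset - cluster_offset + bytes = 5464 rounds up to one cluster.
 */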

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    waited = bdrv_wait_serialising_requests_locked(bs, self);
    qemu_co_mutex_unlock(&bs->reqs_lock);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
    aio_wait_kick();
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);

    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

/* return < 0 if error. See bdrv_pwrite() for the return codes */
int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* Return the number of bytes on success or < 0 on error.  Important errors
 * are:
 * -EIO         generic I/O error (may happen for all errors)
 * -ENOMEDIUM   No media inserted.
 * -EINVAL      Invalid offset or number of bytes
 * -EACCES      Trying to write a read-only device
 */
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}
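
/*
 * Illustrative usage sketch (not an additional API): the synchronous helpers
 * above are typically used for small metadata updates, e.g. a
 * read-modify-write of a 512-byte header through a BdrvChild:
 *
 *     uint8_t header[512];
 *     ret = bdrv_pread(child, 0, header, sizeof(header));
 *     ... check ret, modify header ...
 *     ret = bdrv_pwrite_sync(child, 0, header, sizeof(header));
 *
 * bdrv_pwrite_sync() flushes after the write so the update acts as a barrier.
 */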

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov,
                                           size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv_part) {
        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_preadv) {
        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
        goto out;
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
            goto out;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
            goto out;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev_part) {
        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}
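
/*
 * Illustrative note (a worked case of the flag handling above):
 * bdrv_driver_pwritev() only passes flags that the driver advertises in
 * bs->supported_write_flags down to the driver; anything left over is
 * emulated.  For example, if a caller sets BDRV_REQ_FUA but the driver does
 * not support it, the flag survives the "flags &= ~bs->supported_write_flags"
 * step and the emulate_flags path issues an explicit bdrv_co_flush() after
 * the write.
 */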

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov,
                               size_t qiov_offset)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector local_qiov;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!block_driver_can_compress(drv)) {
        return -ENOTSUP;
    }

    if (drv->bdrv_co_pwritev_compressed_part) {
        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
                                                    qiov, qiov_offset);
    }

    if (qiov_offset == 0) {
        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
    }

    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
        size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer = NULL;

    BlockDriver *drv = bs->drv;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;
    bool skip_write;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /*
     * Do not write anything when the BDS is inactive.  That is not
     * allowed, and it would not help.
     */
    skip_write = (bs->open_flags & BDRV_O_INACTIVE);

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.  Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    while (cluster_bytes) {
        int64_t pnum;

        if (skip_write) {
            ret = 1; /* "already allocated", so nothing will be copied */
            pnum = MIN(cluster_bytes, max_transfer);
        } else {
            ret = bdrv_is_allocated(bs, cluster_offset,
                                    MIN(cluster_bytes, max_transfer), &pnum);
            if (ret < 0) {
                /*
                 * Safe to treat errors in querying allocation as if
                 * unallocated; we'll probably fail again soon on the
                 * read, but at least that will set a decent errno.
                 */
                pnum = MIN(cluster_bytes, max_transfer);
            }

            /* Stop at EOF if the image ends in the middle of the cluster */
            if (ret == 0 && pnum == 0) {
                assert(progress >= bytes);
                break;
            }

            assert(skip_bytes < pnum);
        }

        if (ret <= 0) {
            QEMUIOVector local_qiov;

            /* Must copy-on-read; use the bounce buffer */
            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            if (!bounce_buffer) {
                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);

                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
                if (!bounce_buffer) {
                    ret = -ENOMEM;
                    goto err;
                }
            }
            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            if (!(flags & BDRV_REQ_PREFETCH)) {
                qemu_iovec_from_buf(qiov, qiov_offset + progress,
                                    bounce_buffer + skip_bytes,
                                    MIN(pnum - skip_bytes, bytes - progress));
            }
        } else if (!(flags & BDRV_REQ_PREFETCH)) {
            /* Read directly into the destination */
            ret = bdrv_driver_preadv(bs, offset + progress,
                                     MIN(pnum - skip_bytes, bytes - progress),
                                     qiov, qiov_offset + progress, 0);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
                                           qiov, qiov_offset, flags);
            goto out;
        } else if (flags & BDRV_REQ_PREFETCH) {
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, qiov, bytes - bytes_remaining, 0);
            max_bytes -= num;
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Request padding
 *
 *  |<---- align ----->|                     |<----- align ---->|
 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
 *  |          |       |                     |     |            |
 * -*----------$-------*-------- ... --------*-----$------------*---
 *  |          |       |                     |     |            |
 *  |          offset  |                     |     end          |
 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
 *  [buf   ... )                             [tail_buf          )
 *
 * @buf is an aligned allocation needed to store @head and @tail paddings.
 * @head is placed at the beginning of @buf and @tail at the end.
 *
 * @tail_buf is a pointer to the sub-buffer corresponding to the align-sized
 * chunk around the tail, if the tail exists.
 *
 * @merge_reads is true for small requests,
 * if @buf_len == @head + bytes + @tail.  In this case it is possible that both
 * head and tail exist but @buf_len == align and @tail_buf == @buf.
 */
typedef struct BdrvRequestPadding {
    uint8_t *buf;
    size_t buf_len;
    uint8_t *tail_buf;
    size_t head;
    size_t tail;
    bool merge_reads;
    QEMUIOVector local_qiov;
} BdrvRequestPadding;
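
/*
 * Illustrative example (a worked case of the padding computed below): with
 * bs->bl.request_alignment = 512, a request with offset = 1000 and
 * bytes = 3000 gets head = 1000 % 512 = 488 and
 * tail = 512 - (4000 % 512) = 96 in bdrv_init_padding().
 * Since head + bytes + tail = 3584 > 512 and both paddings exist,
 * buf_len = 2 * 512 and merge_reads stays false.
 */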

static bool bdrv_init_padding(BlockDriverState *bs,
                              int64_t offset, int64_t bytes,
                              BdrvRequestPadding *pad)
{
    uint64_t align = bs->bl.request_alignment;
    size_t sum;

    memset(pad, 0, sizeof(*pad));

    pad->head = offset & (align - 1);
    pad->tail = ((offset + bytes) & (align - 1));
    if (pad->tail) {
        pad->tail = align - pad->tail;
    }

    if (!pad->head && !pad->tail) {
        return false;
    }

    assert(bytes); /* Nothing good in aligning zero-length requests */

    sum = pad->head + bytes + pad->tail;
    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
    pad->buf = qemu_blockalign(bs, pad->buf_len);
    pad->merge_reads = sum == pad->buf_len;
    if (pad->tail) {
        pad->tail_buf = pad->buf + pad->buf_len - align;
    }

    return true;
}

static int bdrv_padding_rmw_read(BdrvChild *child,
                                 BdrvTrackedRequest *req,
                                 BdrvRequestPadding *pad,
                                 bool zero_middle)
{
    QEMUIOVector local_qiov;
    BlockDriverState *bs = child->bs;
    uint64_t align = bs->bl.request_alignment;
    int ret;

    assert(req->serialising && pad->buf);

    if (pad->head || pad->merge_reads) {
        uint64_t bytes = pad->merge_reads ? pad->buf_len : align;

        qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);

        if (pad->head) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        }
        if (pad->merge_reads && pad->tail) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        }
        ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
                                  align, &local_qiov, 0, 0);
        if (ret < 0) {
            return ret;
        }
        if (pad->head) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
        }
        if (pad->merge_reads && pad->tail) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
        }

        if (pad->merge_reads) {
            goto zero_mem;
        }
    }

    if (pad->tail) {
        qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(
                child, req,
                req->overlap_offset + req->overlap_bytes - align,
                align, align, &local_qiov, 0, 0);
        if (ret < 0) {
            return ret;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
    }

zero_mem:
    if (zero_middle) {
        memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
    }

    return 0;
}

static void bdrv_padding_destroy(BdrvRequestPadding *pad)
{
    if (pad->buf) {
        qemu_vfree(pad->buf);
        qemu_iovec_destroy(&pad->local_qiov);
    }
}

/*
 * bdrv_pad_request
 *
 * Exchange request parameters with the padded request if needed.  The RMW
 * read of the padding is not performed here; call bdrv_padding_rmw_read()
 * separately if it is needed.
 *
 * All parameters except @bs are in-out: they describe the original request on
 * entry and the padded request (if padding is needed) on return.
 *
 * Function always succeeds.
 */
static bool bdrv_pad_request(BlockDriverState *bs,
                             QEMUIOVector **qiov, size_t *qiov_offset,
                             int64_t *offset, unsigned int *bytes,
                             BdrvRequestPadding *pad)
{
    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
        return false;
    }

    qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
                             *qiov, *qiov_offset, *bytes,
                             pad->buf + pad->buf_len - pad->tail, pad->tail);
    *bytes += pad->head + pad->tail;
    *offset -= pad->head;
    *qiov = &pad->local_qiov;
    *qiov_offset = 0;

    return true;
}

int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
}

int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
    int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    BdrvRequestPadding pad;
    int ret;

    trace_bdrv_co_preadv(bs, offset, bytes, flags);

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning a zero-length request is nonsense.  Even if the driver
         * assigns a special meaning to zero-length requests (like
         * qcow2_co_pwritev_compressed_part), we cannot pass such a request
         * through to it because of request_alignment.
         *
         * Still, there is no reason to return an error if someone does an
         * unaligned zero-length read occasionally.
         */
        return 0;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
                              bs->bl.request_alignment,
                              qiov, qiov_offset, flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    bdrv_padding_destroy(&pad);

    return ret;
}
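
/*
 * Illustrative usage sketch (not an additional API): a coroutine caller that
 * wants to read 512 bytes from the start of a child node could do:
 *
 *     uint8_t buf[512];
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, sizeof(buf));
 *     ret = bdrv_co_preadv(child, 0, sizeof(buf), &qiov, 0);
 *
 * Padding to bs->bl.request_alignment and copy-on-read handling happen
 * internally, as shown above.
 */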

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    void *buf = NULL;
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
        return -ENOTSUP;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            if (buf == NULL) {
                buf = qemu_try_blockalign0(bs, num);
                if (buf == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }
            qemu_iovec_init_buf(&qiov, buf, num);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(buf);
                buf = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(buf);
    return ret;
}
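
/*
 * Illustrative example (a worked case of the alignment loop above):
 * assuming alignment = 4096 and sufficiently large max_transfer and
 * max_write_zeroes, a zero request at offset = 4000 with bytes = 12000 is
 * split into an unaligned head of 96 bytes (4000..4096), an aligned middle
 * of 8192 bytes, and an unaligned tail of 3712 bytes, so that only the head
 * and tail may need the slow path in the driver.
 */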

static inline int coroutine_fn
bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
                          BdrvTrackedRequest *req, int flags)
{
    BlockDriverState *bs = child->bs;
    bool waited;
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);

    if (bs->read_only) {
        return -EPERM;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));

    if (flags & BDRV_REQ_SERIALISING) {
        waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
        /*
         * For a misaligned request we should have already waited earlier,
         * because we come after bdrv_padding_rmw_read which must be called
         * with the request already marked as serialising.
         */
        assert(!waited ||
               (req->offset == req->overlap_offset &&
                req->bytes == req->overlap_bytes));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    switch (req->type) {
    case BDRV_TRACKED_WRITE:
    case BDRV_TRACKED_DISCARD:
        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
        } else {
            assert(child->perm & BLK_PERM_WRITE);
        }
        return notifier_with_return_list_notify(&bs->before_write_notifiers,
                                                req);
    case BDRV_TRACKED_TRUNCATE:
        assert(child->perm & BLK_PERM_RESIZE);
        return 0;
    default:
        abort();
    }
}

static inline void coroutine_fn
bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
                         BdrvTrackedRequest *req, int ret)
{
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    BlockDriverState *bs = child->bs;

    atomic_inc(&bs->write_gen);

    /*
     * Discard cannot extend the image, but in error handling cases, such as
     * when reverting a qcow2 cluster allocation, the discarded range can pass
     * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD
     * here. Instead, just skip it, since semantically a discard request
     * beyond EOF cannot expand the image anyway.
     */
    if (ret == 0 &&
        (req->type == BDRV_TRACKED_TRUNCATE ||
         end_sector > bs->total_sectors) &&
        req->type != BDRV_TRACKED_DISCARD) {
        bs->total_sectors = end_sector;
        bdrv_parent_cb_resize(bs);
        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
    }
    if (req->bytes) {
        switch (req->type) {
        case BDRV_TRACKED_WRITE:
            stat64_max(&bs->wr_highest_offset, offset + bytes);
            /* fall through, to set dirty bits */
        case BDRV_TRACKED_DISCARD:
            bdrv_set_dirty(bs, offset, bytes);
            break;
        default:
            break;
        }
    }
}
1949 */ 1950 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1951 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1952 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 1953 { 1954 BlockDriverState *bs = child->bs; 1955 BlockDriver *drv = bs->drv; 1956 int ret; 1957 1958 uint64_t bytes_remaining = bytes; 1959 int max_transfer; 1960 1961 if (!drv) { 1962 return -ENOMEDIUM; 1963 } 1964 1965 if (bdrv_has_readonly_bitmaps(bs)) { 1966 return -EPERM; 1967 } 1968 1969 assert(is_power_of_2(align)); 1970 assert((offset & (align - 1)) == 0); 1971 assert((bytes & (align - 1)) == 0); 1972 assert(!qiov || qiov_offset + bytes <= qiov->size); 1973 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1974 align); 1975 1976 ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 1977 1978 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1979 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 1980 qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 1981 flags |= BDRV_REQ_ZERO_WRITE; 1982 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1983 flags |= BDRV_REQ_MAY_UNMAP; 1984 } 1985 } 1986 1987 if (ret < 0) { 1988 /* Do nothing, write notifier decided to fail this request */ 1989 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1990 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1991 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 1992 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 1993 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 1994 qiov, qiov_offset); 1995 } else if (bytes <= max_transfer) { 1996 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1997 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 1998 } else { 1999 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2000 while (bytes_remaining) { 2001 int num = MIN(bytes_remaining, max_transfer); 2002 int local_flags = flags; 2003 2004 assert(num); 2005 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 2006 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 2007 /* If FUA is going to be emulated by flush, we only 2008 * need to flush on the last iteration */ 2009 local_flags &= ~BDRV_REQ_FUA; 2010 } 2011 2012 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 2013 num, qiov, bytes - bytes_remaining, 2014 local_flags); 2015 if (ret < 0) { 2016 break; 2017 } 2018 bytes_remaining -= num; 2019 } 2020 } 2021 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 2022 2023 if (ret >= 0) { 2024 ret = 0; 2025 } 2026 bdrv_co_write_req_finish(child, offset, bytes, req, ret); 2027 2028 return ret; 2029 } 2030 2031 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 2032 int64_t offset, 2033 unsigned int bytes, 2034 BdrvRequestFlags flags, 2035 BdrvTrackedRequest *req) 2036 { 2037 BlockDriverState *bs = child->bs; 2038 QEMUIOVector local_qiov; 2039 uint64_t align = bs->bl.request_alignment; 2040 int ret = 0; 2041 bool padding; 2042 BdrvRequestPadding pad; 2043 2044 padding = bdrv_init_padding(bs, offset, bytes, &pad); 2045 if (padding) { 2046 bdrv_mark_request_serialising(req, align); 2047 2048 bdrv_padding_rmw_read(child, req, &pad, true); 2049 2050 if (pad.head || pad.merge_reads) { 2051 int64_t aligned_offset = offset & ~(align - 1); 2052 int64_t write_bytes = pad.merge_reads ? 
pad.buf_len : align; 2053 2054 qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 2055 ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 2056 align, &local_qiov, 0, 2057 flags & ~BDRV_REQ_ZERO_WRITE); 2058 if (ret < 0 || pad.merge_reads) { 2059 /* Error or all work is done */ 2060 goto out; 2061 } 2062 offset += write_bytes - pad.head; 2063 bytes -= write_bytes - pad.head; 2064 } 2065 } 2066 2067 assert(!bytes || (offset & (align - 1)) == 0); 2068 if (bytes >= align) { 2069 /* Write the aligned part in the middle. */ 2070 uint64_t aligned_bytes = bytes & ~(align - 1); 2071 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 2072 NULL, 0, flags); 2073 if (ret < 0) { 2074 goto out; 2075 } 2076 bytes -= aligned_bytes; 2077 offset += aligned_bytes; 2078 } 2079 2080 assert(!bytes || (offset & (align - 1)) == 0); 2081 if (bytes) { 2082 assert(align == pad.tail + bytes); 2083 2084 qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align); 2085 ret = bdrv_aligned_pwritev(child, req, offset, align, align, 2086 &local_qiov, 0, 2087 flags & ~BDRV_REQ_ZERO_WRITE); 2088 } 2089 2090 out: 2091 bdrv_padding_destroy(&pad); 2092 2093 return ret; 2094 } 2095 2096 /* 2097 * Handle a write request in coroutine context 2098 */ 2099 int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 2100 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 2101 BdrvRequestFlags flags) 2102 { 2103 return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); 2104 } 2105 2106 int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, 2107 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, 2108 BdrvRequestFlags flags) 2109 { 2110 BlockDriverState *bs = child->bs; 2111 BdrvTrackedRequest req; 2112 uint64_t align = bs->bl.request_alignment; 2113 BdrvRequestPadding pad; 2114 int ret; 2115 2116 trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); 2117 2118 if (!bs->drv) { 2119 return -ENOMEDIUM; 2120 } 2121 2122 ret = bdrv_check_byte_request(bs, offset, bytes); 2123 if (ret < 0) { 2124 return ret; 2125 } 2126 2127 /* If the request is misaligned then we can't make it efficient */ 2128 if ((flags & BDRV_REQ_NO_FALLBACK) && 2129 !QEMU_IS_ALIGNED(offset | bytes, align)) 2130 { 2131 return -ENOTSUP; 2132 } 2133 2134 if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 2135 /* 2136 * Aligning a zero-length request is nonsense. Even if the driver gives 2137 * zero length a special meaning (like qcow2_co_pwritev_compressed_part), 2138 * we can't pass the request to the driver because of request_alignment. 2139 * 2140 * Still, there is no reason to return an error if someone does an 2141 * unaligned zero-length write occasionally. 2142 */ 2143 return 0; 2144 } 2145 2146 bdrv_inc_in_flight(bs); 2147 /* 2148 * Align write if necessary by performing a read-modify-write cycle. 2149 * Pad qiov with the read parts and be sure to have a tracked request not 2150 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
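 *
 * For example (illustrative numbers only, not from the original comment):
 * with a request_alignment of 512, a 512-byte write at offset 700 touches
 * the aligned region [512, 1536). The padding code reads the head
 * [512, 700) and the tail [1212, 1536) into the pad buffers, and the
 * aligned write is then issued for the whole region.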
2151 */ 2152 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 2153 2154 if (flags & BDRV_REQ_ZERO_WRITE) { 2155 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 2156 goto out; 2157 } 2158 2159 if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) { 2160 bdrv_mark_request_serialising(&req, align); 2161 bdrv_padding_rmw_read(child, &req, &pad, false); 2162 } 2163 2164 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 2165 qiov, qiov_offset, flags); 2166 2167 bdrv_padding_destroy(&pad); 2168 2169 out: 2170 tracked_request_end(&req); 2171 bdrv_dec_in_flight(bs); 2172 2173 return ret; 2174 } 2175 2176 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2177 int bytes, BdrvRequestFlags flags) 2178 { 2179 trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 2180 2181 if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 2182 flags &= ~BDRV_REQ_MAY_UNMAP; 2183 } 2184 2185 return bdrv_co_pwritev(child, offset, bytes, NULL, 2186 BDRV_REQ_ZERO_WRITE | flags); 2187 } 2188 2189 /* 2190 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend. 2191 */ 2192 int bdrv_flush_all(void) 2193 { 2194 BdrvNextIterator it; 2195 BlockDriverState *bs = NULL; 2196 int result = 0; 2197 2198 /* 2199 * The bdrv queue is managed by record/replay; creating a new flush 2200 * request for stopping the VM may break determinism, so do nothing 2201 * in that case. 2202 */ 2203 if (replay_events_enabled()) { 2204 return result; 2205 } 2206 2207 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 2208 AioContext *aio_context = bdrv_get_aio_context(bs); 2209 int ret; 2210 2211 aio_context_acquire(aio_context); 2212 ret = bdrv_flush(bs); 2213 if (ret < 0 && !result) { 2214 result = ret; 2215 } 2216 aio_context_release(aio_context); 2217 } 2218 2219 return result; 2220 } 2221 2222 2223 typedef struct BdrvCoBlockStatusData { 2224 BlockDriverState *bs; 2225 BlockDriverState *base; 2226 bool want_zero; 2227 int64_t offset; 2228 int64_t bytes; 2229 int64_t *pnum; 2230 int64_t *map; 2231 BlockDriverState **file; 2232 int ret; 2233 bool done; 2234 } BdrvCoBlockStatusData; 2235 2236 int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, 2237 bool want_zero, 2238 int64_t offset, 2239 int64_t bytes, 2240 int64_t *pnum, 2241 int64_t *map, 2242 BlockDriverState **file) 2243 { 2244 assert(bs->file && bs->file->bs); 2245 *pnum = bytes; 2246 *map = offset; 2247 *file = bs->file->bs; 2248 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2249 } 2250 2251 int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, 2252 bool want_zero, 2253 int64_t offset, 2254 int64_t bytes, 2255 int64_t *pnum, 2256 int64_t *map, 2257 BlockDriverState **file) 2258 { 2259 assert(bs->backing && bs->backing->bs); 2260 *pnum = bytes; 2261 *map = offset; 2262 *file = bs->backing->bs; 2263 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2264 } 2265 2266 /* 2267 * Returns the allocation status of the specified range. 2268 * Drivers not implementing the functionality are assumed to not support 2269 * backing files, hence all their data is reported as allocated. 2270 * 2271 * If 'want_zero' is true, the caller is querying for mapping 2272 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 2273 * _ZERO where possible; otherwise, the result favors larger 'pnum', 2274 * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2275 * 2276 * If 'offset' is beyond the end of the disk image the return value is 2277 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2278 * 2279 * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2280 * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2281 * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 2282 * 2283 * 'pnum' is set to the number of bytes (including and immediately 2284 * following the specified offset) that are easily known to be in the 2285 * same allocated/unallocated state. Note that a second call starting 2286 * at the original offset plus returned pnum may have the same status. 2287 * The returned value is non-zero on success except at end-of-file. 2288 * 2289 * Returns negative errno on failure. Otherwise, if the 2290 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 2291 * set to the host mapping and BDS corresponding to the guest offset. 2292 */ 2293 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2294 bool want_zero, 2295 int64_t offset, int64_t bytes, 2296 int64_t *pnum, int64_t *map, 2297 BlockDriverState **file) 2298 { 2299 int64_t total_size; 2300 int64_t n; /* bytes */ 2301 int ret; 2302 int64_t local_map = 0; 2303 BlockDriverState *local_file = NULL; 2304 int64_t aligned_offset, aligned_bytes; 2305 uint32_t align; 2306 2307 assert(pnum); 2308 *pnum = 0; 2309 total_size = bdrv_getlength(bs); 2310 if (total_size < 0) { 2311 ret = total_size; 2312 goto early_out; 2313 } 2314 2315 if (offset >= total_size) { 2316 ret = BDRV_BLOCK_EOF; 2317 goto early_out; 2318 } 2319 if (!bytes) { 2320 ret = 0; 2321 goto early_out; 2322 } 2323 2324 n = total_size - offset; 2325 if (n < bytes) { 2326 bytes = n; 2327 } 2328 2329 /* Must be non-NULL or bdrv_getlength() would have failed */ 2330 assert(bs->drv); 2331 if (!bs->drv->bdrv_co_block_status) { 2332 *pnum = bytes; 2333 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 2334 if (offset + bytes == total_size) { 2335 ret |= BDRV_BLOCK_EOF; 2336 } 2337 if (bs->drv->protocol_name) { 2338 ret |= BDRV_BLOCK_OFFSET_VALID; 2339 local_map = offset; 2340 local_file = bs; 2341 } 2342 goto early_out; 2343 } 2344 2345 bdrv_inc_in_flight(bs); 2346 2347 /* Round out to request_alignment boundaries */ 2348 align = bs->bl.request_alignment; 2349 aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2350 aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2351 2352 ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 2353 aligned_bytes, pnum, &local_map, 2354 &local_file); 2355 if (ret < 0) { 2356 *pnum = 0; 2357 goto out; 2358 } 2359 2360 /* 2361 * The driver's result must be a non-zero multiple of request_alignment. 2362 * Clamp pnum and adjust map to original request. 
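 *
 * For example (illustrative numbers only, not from the original comment):
 * with align of 512, offset 1000 and bytes 100, the driver was queried for
 * [512, 1536). If it reports *pnum == 1024, the code below subtracts the
 * 488 bytes of rounding (1000 - 512), clamps the result to the requested
 * 100 bytes, and, when BDRV_BLOCK_OFFSET_VALID is set, advances local_map
 * by the same 488 bytes.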
2363 */ 2364 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2365 align > offset - aligned_offset); 2366 if (ret & BDRV_BLOCK_RECURSE) { 2367 assert(ret & BDRV_BLOCK_DATA); 2368 assert(ret & BDRV_BLOCK_OFFSET_VALID); 2369 assert(!(ret & BDRV_BLOCK_ZERO)); 2370 } 2371 2372 *pnum -= offset - aligned_offset; 2373 if (*pnum > bytes) { 2374 *pnum = bytes; 2375 } 2376 if (ret & BDRV_BLOCK_OFFSET_VALID) { 2377 local_map += offset - aligned_offset; 2378 } 2379 2380 if (ret & BDRV_BLOCK_RAW) { 2381 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2382 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2383 *pnum, pnum, &local_map, &local_file); 2384 goto out; 2385 } 2386 2387 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2388 ret |= BDRV_BLOCK_ALLOCATED; 2389 } else if (want_zero) { 2390 if (bdrv_unallocated_blocks_are_zero(bs)) { 2391 ret |= BDRV_BLOCK_ZERO; 2392 } else if (bs->backing) { 2393 BlockDriverState *bs2 = bs->backing->bs; 2394 int64_t size2 = bdrv_getlength(bs2); 2395 2396 if (size2 >= 0 && offset >= size2) { 2397 ret |= BDRV_BLOCK_ZERO; 2398 } 2399 } 2400 } 2401 2402 if (want_zero && ret & BDRV_BLOCK_RECURSE && 2403 local_file && local_file != bs && 2404 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2405 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2406 int64_t file_pnum; 2407 int ret2; 2408 2409 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2410 *pnum, &file_pnum, NULL, NULL); 2411 if (ret2 >= 0) { 2412 /* Ignore errors. This is just providing extra information, it 2413 * is useful but not necessary. 2414 */ 2415 if (ret2 & BDRV_BLOCK_EOF && 2416 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2417 /* 2418 * It is valid for the format block driver to read 2419 * beyond the end of the underlying file's current 2420 * size; such areas read as zero. 2421 */ 2422 ret |= BDRV_BLOCK_ZERO; 2423 } else { 2424 /* Limit request to the range reported by the protocol driver */ 2425 *pnum = file_pnum; 2426 ret |= (ret2 & BDRV_BLOCK_ZERO); 2427 } 2428 } 2429 } 2430 2431 out: 2432 bdrv_dec_in_flight(bs); 2433 if (ret >= 0 && offset + *pnum == total_size) { 2434 ret |= BDRV_BLOCK_EOF; 2435 } 2436 early_out: 2437 if (file) { 2438 *file = local_file; 2439 } 2440 if (map) { 2441 *map = local_map; 2442 } 2443 return ret; 2444 } 2445 2446 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2447 BlockDriverState *base, 2448 bool want_zero, 2449 int64_t offset, 2450 int64_t bytes, 2451 int64_t *pnum, 2452 int64_t *map, 2453 BlockDriverState **file) 2454 { 2455 BlockDriverState *p; 2456 int ret = 0; 2457 bool first = true; 2458 2459 assert(bs != base); 2460 for (p = bs; p != base; p = backing_bs(p)) { 2461 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2462 file); 2463 if (ret < 0) { 2464 break; 2465 } 2466 if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2467 /* 2468 * Reading beyond the end of the file continues to read 2469 * zeroes, but we can only widen the result to the 2470 * unallocated length we learned from an earlier 2471 * iteration. 2472 */ 2473 *pnum = bytes; 2474 } 2475 if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2476 break; 2477 } 2478 /* [offset, pnum] unallocated on this layer, which could be only 2479 * the first part of [offset, bytes]. 
*/ 2480 bytes = MIN(bytes, *pnum); 2481 first = false; 2482 } 2483 return ret; 2484 } 2485 2486 /* Coroutine wrapper for bdrv_block_status_above() */ 2487 static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 2488 { 2489 BdrvCoBlockStatusData *data = opaque; 2490 2491 data->ret = bdrv_co_block_status_above(data->bs, data->base, 2492 data->want_zero, 2493 data->offset, data->bytes, 2494 data->pnum, data->map, data->file); 2495 data->done = true; 2496 aio_wait_kick(); 2497 } 2498 2499 /* 2500 * Synchronous wrapper around bdrv_co_block_status_above(). 2501 * 2502 * See bdrv_co_block_status_above() for details. 2503 */ 2504 static int bdrv_common_block_status_above(BlockDriverState *bs, 2505 BlockDriverState *base, 2506 bool want_zero, int64_t offset, 2507 int64_t bytes, int64_t *pnum, 2508 int64_t *map, 2509 BlockDriverState **file) 2510 { 2511 Coroutine *co; 2512 BdrvCoBlockStatusData data = { 2513 .bs = bs, 2514 .base = base, 2515 .want_zero = want_zero, 2516 .offset = offset, 2517 .bytes = bytes, 2518 .pnum = pnum, 2519 .map = map, 2520 .file = file, 2521 .done = false, 2522 }; 2523 2524 if (qemu_in_coroutine()) { 2525 /* Fast-path if already in coroutine context */ 2526 bdrv_block_status_above_co_entry(&data); 2527 } else { 2528 co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data); 2529 bdrv_coroutine_enter(bs, co); 2530 BDRV_POLL_WHILE(bs, !data.done); 2531 } 2532 return data.ret; 2533 } 2534 2535 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 2536 int64_t offset, int64_t bytes, int64_t *pnum, 2537 int64_t *map, BlockDriverState **file) 2538 { 2539 return bdrv_common_block_status_above(bs, base, true, offset, bytes, 2540 pnum, map, file); 2541 } 2542 2543 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2544 int64_t *pnum, int64_t *map, BlockDriverState **file) 2545 { 2546 return bdrv_block_status_above(bs, backing_bs(bs), 2547 offset, bytes, pnum, map, file); 2548 } 2549 2550 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2551 int64_t bytes, int64_t *pnum) 2552 { 2553 int ret; 2554 int64_t dummy; 2555 2556 ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, 2557 bytes, pnum ? pnum : &dummy, NULL, 2558 NULL); 2559 if (ret < 0) { 2560 return ret; 2561 } 2562 return !!(ret & BDRV_BLOCK_ALLOCATED); 2563 } 2564 2565 /* 2566 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 2567 * 2568 * Return 1 if (a prefix of) the given range is allocated in any image 2569 * between BASE and TOP (BASE is only included if include_base is set). 2570 * BASE can be NULL to check if the given offset is allocated in any 2571 * image of the chain. Return 0 otherwise, or negative errno on 2572 * failure. 2573 * 2574 * 'pnum' is set to the number of bytes (including and immediately 2575 * following the specified offset) that are known to be in the same 2576 * allocated/unallocated state. Note that a subsequent call starting 2577 * at 'offset + *pnum' may return the same allocation status (in other 2578 * words, the result is not necessarily the maximum possible range); 2579 * but 'pnum' will only be 0 when end of file is reached. 
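 *
 * An illustrative caller (hypothetical, shown only to clarify the contract
 * above) scanning the range [0, len) would typically loop, advancing by
 * *pnum each iteration:
 *
 *     int64_t off, n;
 *     for (off = 0; off < len; off += n) {
 *         ret = bdrv_is_allocated_above(top, base, false, off,
 *                                       len - off, &n);
 *         if (ret < 0 || n == 0) {
 *             break;
 *         }
 *         ... ret == 1 means a prefix [off, off + n) is allocated ...
 *     }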
2580 * 2581 */ 2582 int bdrv_is_allocated_above(BlockDriverState *top, 2583 BlockDriverState *base, 2584 bool include_base, int64_t offset, 2585 int64_t bytes, int64_t *pnum) 2586 { 2587 BlockDriverState *intermediate; 2588 int ret; 2589 int64_t n = bytes; 2590 2591 assert(base || !include_base); 2592 2593 intermediate = top; 2594 while (include_base || intermediate != base) { 2595 int64_t pnum_inter; 2596 int64_t size_inter; 2597 2598 assert(intermediate); 2599 ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 2600 if (ret < 0) { 2601 return ret; 2602 } 2603 if (ret) { 2604 *pnum = pnum_inter; 2605 return 1; 2606 } 2607 2608 size_inter = bdrv_getlength(intermediate); 2609 if (size_inter < 0) { 2610 return size_inter; 2611 } 2612 if (n > pnum_inter && 2613 (intermediate == top || offset + pnum_inter < size_inter)) { 2614 n = pnum_inter; 2615 } 2616 2617 if (intermediate == base) { 2618 break; 2619 } 2620 2621 intermediate = backing_bs(intermediate); 2622 } 2623 2624 *pnum = n; 2625 return 0; 2626 } 2627 2628 typedef struct BdrvVmstateCo { 2629 BlockDriverState *bs; 2630 QEMUIOVector *qiov; 2631 int64_t pos; 2632 bool is_read; 2633 int ret; 2634 } BdrvVmstateCo; 2635 2636 static int coroutine_fn 2637 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2638 bool is_read) 2639 { 2640 BlockDriver *drv = bs->drv; 2641 int ret = -ENOTSUP; 2642 2643 bdrv_inc_in_flight(bs); 2644 2645 if (!drv) { 2646 ret = -ENOMEDIUM; 2647 } else if (drv->bdrv_load_vmstate) { 2648 if (is_read) { 2649 ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2650 } else { 2651 ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2652 } 2653 } else if (bs->file) { 2654 ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); 2655 } 2656 2657 bdrv_dec_in_flight(bs); 2658 return ret; 2659 } 2660 2661 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) 2662 { 2663 BdrvVmstateCo *co = opaque; 2664 co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); 2665 aio_wait_kick(); 2666 } 2667 2668 static inline int 2669 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2670 bool is_read) 2671 { 2672 if (qemu_in_coroutine()) { 2673 return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); 2674 } else { 2675 BdrvVmstateCo data = { 2676 .bs = bs, 2677 .qiov = qiov, 2678 .pos = pos, 2679 .is_read = is_read, 2680 .ret = -EINPROGRESS, 2681 }; 2682 Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); 2683 2684 bdrv_coroutine_enter(bs, co); 2685 BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS); 2686 return data.ret; 2687 } 2688 } 2689 2690 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 2691 int64_t pos, int size) 2692 { 2693 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2694 int ret; 2695 2696 ret = bdrv_writev_vmstate(bs, &qiov, pos); 2697 if (ret < 0) { 2698 return ret; 2699 } 2700 2701 return size; 2702 } 2703 2704 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2705 { 2706 return bdrv_rw_vmstate(bs, qiov, pos, false); 2707 } 2708 2709 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 2710 int64_t pos, int size) 2711 { 2712 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2713 int ret; 2714 2715 ret = bdrv_readv_vmstate(bs, &qiov, pos); 2716 if (ret < 0) { 2717 return ret; 2718 } 2719 2720 return size; 2721 } 2722 2723 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2724 { 2725 return bdrv_rw_vmstate(bs, qiov, pos, true); 2726 } 2727 2728 
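/*
 * Illustrative sketch, not part of the original file: one plausible way for
 * a caller to walk an image with bdrv_block_status(), advancing by *pnum as
 * described in the comments above. The helper name and the idea of counting
 * allocated bytes are hypothetical and exist only as an example.
 */
static int G_GNUC_UNUSED
bdrv_example_count_allocated_bytes(BlockDriverState *bs, int64_t *allocated)
{
    int64_t len = bdrv_getlength(bs);
    int64_t offset = 0;

    if (len < 0) {
        return len;
    }

    *allocated = 0;
    while (offset < len) {
        int64_t pnum;
        int ret = bdrv_block_status(bs, offset, len - offset, &pnum,
                                    NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (pnum == 0) {
            /* Only expected at end of file */
            break;
        }
        if (ret & BDRV_BLOCK_ALLOCATED) {
            *allocated += pnum;
        }
        offset += pnum;
    }

    return 0;
}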
/**************************************************************/ 2729 /* async I/Os */ 2730 2731 void bdrv_aio_cancel(BlockAIOCB *acb) 2732 { 2733 qemu_aio_ref(acb); 2734 bdrv_aio_cancel_async(acb); 2735 while (acb->refcnt > 1) { 2736 if (acb->aiocb_info->get_aio_context) { 2737 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2738 } else if (acb->bs) { 2739 /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 2740 * assert that we're not using an I/O thread. Thread-safe 2741 * code should use bdrv_aio_cancel_async exclusively. 2742 */ 2743 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 2744 aio_poll(bdrv_get_aio_context(acb->bs), true); 2745 } else { 2746 abort(); 2747 } 2748 } 2749 qemu_aio_unref(acb); 2750 } 2751 2752 /* Async version of aio cancel. The caller is not blocked if the acb implements 2753 * cancel_async, otherwise we do nothing and let the request normally complete. 2754 * In either case the completion callback must be called. */ 2755 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2756 { 2757 if (acb->aiocb_info->cancel_async) { 2758 acb->aiocb_info->cancel_async(acb); 2759 } 2760 } 2761 2762 /**************************************************************/ 2763 /* Coroutine block device emulation */ 2764 2765 typedef struct FlushCo { 2766 BlockDriverState *bs; 2767 int ret; 2768 } FlushCo; 2769 2770 2771 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 2772 { 2773 FlushCo *rwco = opaque; 2774 2775 rwco->ret = bdrv_co_flush(rwco->bs); 2776 aio_wait_kick(); 2777 } 2778 2779 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2780 { 2781 int current_gen; 2782 int ret = 0; 2783 2784 bdrv_inc_in_flight(bs); 2785 2786 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2787 bdrv_is_sg(bs)) { 2788 goto early_exit; 2789 } 2790 2791 qemu_co_mutex_lock(&bs->reqs_lock); 2792 current_gen = atomic_read(&bs->write_gen); 2793 2794 /* Wait until any previous flushes are completed */ 2795 while (bs->active_flush_req) { 2796 qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 2797 } 2798 2799 /* Flushes reach this point in nondecreasing current_gen order. 
*/ 2800 bs->active_flush_req = true; 2801 qemu_co_mutex_unlock(&bs->reqs_lock); 2802 2803 /* Write back all layers by calling one driver function */ 2804 if (bs->drv->bdrv_co_flush) { 2805 ret = bs->drv->bdrv_co_flush(bs); 2806 goto out; 2807 } 2808 2809 /* Write back cached data to the OS even with cache=unsafe */ 2810 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2811 if (bs->drv->bdrv_co_flush_to_os) { 2812 ret = bs->drv->bdrv_co_flush_to_os(bs); 2813 if (ret < 0) { 2814 goto out; 2815 } 2816 } 2817 2818 /* But don't actually force it to the disk with cache=unsafe */ 2819 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2820 goto flush_parent; 2821 } 2822 2823 /* Check if we really need to flush anything */ 2824 if (bs->flushed_gen == current_gen) { 2825 goto flush_parent; 2826 } 2827 2828 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2829 if (!bs->drv) { 2830 /* bs->drv->bdrv_co_flush() might have ejected the BDS 2831 * (even in case of apparent success) */ 2832 ret = -ENOMEDIUM; 2833 goto out; 2834 } 2835 if (bs->drv->bdrv_co_flush_to_disk) { 2836 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2837 } else if (bs->drv->bdrv_aio_flush) { 2838 BlockAIOCB *acb; 2839 CoroutineIOCompletion co = { 2840 .coroutine = qemu_coroutine_self(), 2841 }; 2842 2843 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2844 if (acb == NULL) { 2845 ret = -EIO; 2846 } else { 2847 qemu_coroutine_yield(); 2848 ret = co.ret; 2849 } 2850 } else { 2851 /* 2852 * Some block drivers always operate in either writethrough or unsafe 2853 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2854 * know how the server works (because the behaviour is hardcoded or 2855 * depends on server-side configuration), so we can't ensure that 2856 * everything is safe on disk. Returning an error doesn't work because 2857 * that would break guests even if the server operates in writethrough 2858 * mode. 2859 * 2860 * Let's hope the user knows what he's doing. 2861 */ 2862 ret = 0; 2863 } 2864 2865 if (ret < 0) { 2866 goto out; 2867 } 2868 2869 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2870 * in the case of cache=unsafe, so there are no useless flushes. 2871 */ 2872 flush_parent: 2873 ret = bs->file ? 
bdrv_co_flush(bs->file->bs) : 0; 2874 out: 2875 /* Notify any pending flushes that we have completed */ 2876 if (ret == 0) { 2877 bs->flushed_gen = current_gen; 2878 } 2879 2880 qemu_co_mutex_lock(&bs->reqs_lock); 2881 bs->active_flush_req = false; 2882 /* Return value is ignored - it's ok if wait queue is empty */ 2883 qemu_co_queue_next(&bs->flush_queue); 2884 qemu_co_mutex_unlock(&bs->reqs_lock); 2885 2886 early_exit: 2887 bdrv_dec_in_flight(bs); 2888 return ret; 2889 } 2890 2891 int bdrv_flush(BlockDriverState *bs) 2892 { 2893 Coroutine *co; 2894 FlushCo flush_co = { 2895 .bs = bs, 2896 .ret = NOT_DONE, 2897 }; 2898 2899 if (qemu_in_coroutine()) { 2900 /* Fast-path if already in coroutine context */ 2901 bdrv_flush_co_entry(&flush_co); 2902 } else { 2903 co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); 2904 bdrv_coroutine_enter(bs, co); 2905 BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE); 2906 } 2907 2908 return flush_co.ret; 2909 } 2910 2911 typedef struct DiscardCo { 2912 BdrvChild *child; 2913 int64_t offset; 2914 int64_t bytes; 2915 int ret; 2916 } DiscardCo; 2917 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) 2918 { 2919 DiscardCo *rwco = opaque; 2920 2921 rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); 2922 aio_wait_kick(); 2923 } 2924 2925 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2926 int64_t bytes) 2927 { 2928 BdrvTrackedRequest req; 2929 int max_pdiscard, ret; 2930 int head, tail, align; 2931 BlockDriverState *bs = child->bs; 2932 2933 if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 2934 return -ENOMEDIUM; 2935 } 2936 2937 if (bdrv_has_readonly_bitmaps(bs)) { 2938 return -EPERM; 2939 } 2940 2941 if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) { 2942 return -EIO; 2943 } 2944 2945 /* Do nothing if disabled. */ 2946 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2947 return 0; 2948 } 2949 2950 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 2951 return 0; 2952 } 2953 2954 /* Discard is advisory, but some devices track and coalesce 2955 * unaligned requests, so we must pass everything down rather than 2956 * round here. Still, most devices will just silently ignore 2957 * unaligned requests (by returning -ENOTSUP), so we must fragment 2958 * the request accordingly. */ 2959 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2960 assert(align % bs->bl.request_alignment == 0); 2961 head = offset % align; 2962 tail = (offset + bytes) % align; 2963 2964 bdrv_inc_in_flight(bs); 2965 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 2966 2967 ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2968 if (ret < 0) { 2969 goto out; 2970 } 2971 2972 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 2973 align); 2974 assert(max_pdiscard >= bs->bl.request_alignment); 2975 2976 while (bytes > 0) { 2977 int64_t num = bytes; 2978 2979 if (head) { 2980 /* Make small requests to get to alignment boundaries. */ 2981 num = MIN(bytes, align - head); 2982 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 2983 num %= bs->bl.request_alignment; 2984 } 2985 head = (head + num) % align; 2986 assert(num < max_pdiscard); 2987 } else if (tail) { 2988 if (num > align) { 2989 /* Shorten the request to the last aligned cluster. 
*/ 2990 num -= tail; 2991 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 2992 tail > bs->bl.request_alignment) { 2993 tail %= bs->bl.request_alignment; 2994 num -= tail; 2995 } 2996 } 2997 /* limit request size */ 2998 if (num > max_pdiscard) { 2999 num = max_pdiscard; 3000 } 3001 3002 if (!bs->drv) { 3003 ret = -ENOMEDIUM; 3004 goto out; 3005 } 3006 if (bs->drv->bdrv_co_pdiscard) { 3007 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 3008 } else { 3009 BlockAIOCB *acb; 3010 CoroutineIOCompletion co = { 3011 .coroutine = qemu_coroutine_self(), 3012 }; 3013 3014 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 3015 bdrv_co_io_em_complete, &co); 3016 if (acb == NULL) { 3017 ret = -EIO; 3018 goto out; 3019 } else { 3020 qemu_coroutine_yield(); 3021 ret = co.ret; 3022 } 3023 } 3024 if (ret && ret != -ENOTSUP) { 3025 goto out; 3026 } 3027 3028 offset += num; 3029 bytes -= num; 3030 } 3031 ret = 0; 3032 out: 3033 bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3034 tracked_request_end(&req); 3035 bdrv_dec_in_flight(bs); 3036 return ret; 3037 } 3038 3039 int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes) 3040 { 3041 Coroutine *co; 3042 DiscardCo rwco = { 3043 .child = child, 3044 .offset = offset, 3045 .bytes = bytes, 3046 .ret = NOT_DONE, 3047 }; 3048 3049 if (qemu_in_coroutine()) { 3050 /* Fast-path if already in coroutine context */ 3051 bdrv_pdiscard_co_entry(&rwco); 3052 } else { 3053 co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); 3054 bdrv_coroutine_enter(child->bs, co); 3055 BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); 3056 } 3057 3058 return rwco.ret; 3059 } 3060 3061 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 3062 { 3063 BlockDriver *drv = bs->drv; 3064 CoroutineIOCompletion co = { 3065 .coroutine = qemu_coroutine_self(), 3066 }; 3067 BlockAIOCB *acb; 3068 3069 bdrv_inc_in_flight(bs); 3070 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 3071 co.ret = -ENOTSUP; 3072 goto out; 3073 } 3074 3075 if (drv->bdrv_co_ioctl) { 3076 co.ret = drv->bdrv_co_ioctl(bs, req, buf); 3077 } else { 3078 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 3079 if (!acb) { 3080 co.ret = -ENOTSUP; 3081 goto out; 3082 } 3083 qemu_coroutine_yield(); 3084 } 3085 out: 3086 bdrv_dec_in_flight(bs); 3087 return co.ret; 3088 } 3089 3090 void *qemu_blockalign(BlockDriverState *bs, size_t size) 3091 { 3092 return qemu_memalign(bdrv_opt_mem_align(bs), size); 3093 } 3094 3095 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 3096 { 3097 return memset(qemu_blockalign(bs, size), 0, size); 3098 } 3099 3100 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 3101 { 3102 size_t align = bdrv_opt_mem_align(bs); 3103 3104 /* Ensure that NULL is never returned on success */ 3105 assert(align > 0); 3106 if (size == 0) { 3107 size = align; 3108 } 3109 3110 return qemu_try_memalign(align, size); 3111 } 3112 3113 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 3114 { 3115 void *mem = qemu_try_blockalign(bs, size); 3116 3117 if (mem) { 3118 memset(mem, 0, size); 3119 } 3120 3121 return mem; 3122 } 3123 3124 /* 3125 * Check if all memory in this vector is sector aligned. 
3126 */ 3127 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 3128 { 3129 int i; 3130 size_t alignment = bdrv_min_mem_align(bs); 3131 3132 for (i = 0; i < qiov->niov; i++) { 3133 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 3134 return false; 3135 } 3136 if (qiov->iov[i].iov_len % alignment) { 3137 return false; 3138 } 3139 } 3140 3141 return true; 3142 } 3143 3144 void bdrv_add_before_write_notifier(BlockDriverState *bs, 3145 NotifierWithReturn *notifier) 3146 { 3147 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 3148 } 3149 3150 void bdrv_io_plug(BlockDriverState *bs) 3151 { 3152 BdrvChild *child; 3153 3154 QLIST_FOREACH(child, &bs->children, next) { 3155 bdrv_io_plug(child->bs); 3156 } 3157 3158 if (atomic_fetch_inc(&bs->io_plugged) == 0) { 3159 BlockDriver *drv = bs->drv; 3160 if (drv && drv->bdrv_io_plug) { 3161 drv->bdrv_io_plug(bs); 3162 } 3163 } 3164 } 3165 3166 void bdrv_io_unplug(BlockDriverState *bs) 3167 { 3168 BdrvChild *child; 3169 3170 assert(bs->io_plugged); 3171 if (atomic_fetch_dec(&bs->io_plugged) == 1) { 3172 BlockDriver *drv = bs->drv; 3173 if (drv && drv->bdrv_io_unplug) { 3174 drv->bdrv_io_unplug(bs); 3175 } 3176 } 3177 3178 QLIST_FOREACH(child, &bs->children, next) { 3179 bdrv_io_unplug(child->bs); 3180 } 3181 } 3182 3183 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 3184 { 3185 BdrvChild *child; 3186 3187 if (bs->drv && bs->drv->bdrv_register_buf) { 3188 bs->drv->bdrv_register_buf(bs, host, size); 3189 } 3190 QLIST_FOREACH(child, &bs->children, next) { 3191 bdrv_register_buf(child->bs, host, size); 3192 } 3193 } 3194 3195 void bdrv_unregister_buf(BlockDriverState *bs, void *host) 3196 { 3197 BdrvChild *child; 3198 3199 if (bs->drv && bs->drv->bdrv_unregister_buf) { 3200 bs->drv->bdrv_unregister_buf(bs, host); 3201 } 3202 QLIST_FOREACH(child, &bs->children, next) { 3203 bdrv_unregister_buf(child->bs, host); 3204 } 3205 } 3206 3207 static int coroutine_fn bdrv_co_copy_range_internal( 3208 BdrvChild *src, uint64_t src_offset, BdrvChild *dst, 3209 uint64_t dst_offset, uint64_t bytes, 3210 BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3211 bool recurse_src) 3212 { 3213 BdrvTrackedRequest req; 3214 int ret; 3215 3216 /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3217 assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3218 assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 3219 3220 if (!dst || !dst->bs) { 3221 return -ENOMEDIUM; 3222 } 3223 ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); 3224 if (ret) { 3225 return ret; 3226 } 3227 if (write_flags & BDRV_REQ_ZERO_WRITE) { 3228 return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3229 } 3230 3231 if (!src || !src->bs) { 3232 return -ENOMEDIUM; 3233 } 3234 ret = bdrv_check_byte_request(src->bs, src_offset, bytes); 3235 if (ret) { 3236 return ret; 3237 } 3238 3239 if (!src->bs->drv->bdrv_co_copy_range_from 3240 || !dst->bs->drv->bdrv_co_copy_range_to 3241 || src->bs->encrypted || dst->bs->encrypted) { 3242 return -ENOTSUP; 3243 } 3244 3245 if (recurse_src) { 3246 bdrv_inc_in_flight(src->bs); 3247 tracked_request_begin(&req, src->bs, src_offset, bytes, 3248 BDRV_TRACKED_READ); 3249 3250 /* BDRV_REQ_SERIALISING is only for write operation */ 3251 assert(!(read_flags & BDRV_REQ_SERIALISING)); 3252 bdrv_wait_serialising_requests(&req); 3253 3254 ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3255 src, src_offset, 3256 dst, dst_offset, 3257 bytes, 3258 read_flags, write_flags); 3259 3260 tracked_request_end(&req); 
3261 bdrv_dec_in_flight(src->bs); 3262 } else { 3263 bdrv_inc_in_flight(dst->bs); 3264 tracked_request_begin(&req, dst->bs, dst_offset, bytes, 3265 BDRV_TRACKED_WRITE); 3266 ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 3267 write_flags); 3268 if (!ret) { 3269 ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3270 src, src_offset, 3271 dst, dst_offset, 3272 bytes, 3273 read_flags, write_flags); 3274 } 3275 bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3276 tracked_request_end(&req); 3277 bdrv_dec_in_flight(dst->bs); 3278 } 3279 3280 return ret; 3281 } 3282 3283 /* Copy range from @src to @dst. 3284 * 3285 * See the comment of bdrv_co_copy_range for the parameter and return value 3286 * semantics. */ 3287 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, 3288 BdrvChild *dst, uint64_t dst_offset, 3289 uint64_t bytes, 3290 BdrvRequestFlags read_flags, 3291 BdrvRequestFlags write_flags) 3292 { 3293 trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3294 read_flags, write_flags); 3295 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3296 bytes, read_flags, write_flags, true); 3297 } 3298 3299 /* Copy range from @src to @dst. 3300 * 3301 * See the comment of bdrv_co_copy_range for the parameter and return value 3302 * semantics. */ 3303 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, 3304 BdrvChild *dst, uint64_t dst_offset, 3305 uint64_t bytes, 3306 BdrvRequestFlags read_flags, 3307 BdrvRequestFlags write_flags) 3308 { 3309 trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3310 read_flags, write_flags); 3311 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3312 bytes, read_flags, write_flags, false); 3313 } 3314 3315 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, 3316 BdrvChild *dst, uint64_t dst_offset, 3317 uint64_t bytes, BdrvRequestFlags read_flags, 3318 BdrvRequestFlags write_flags) 3319 { 3320 return bdrv_co_copy_range_from(src, src_offset, 3321 dst, dst_offset, 3322 bytes, read_flags, write_flags); 3323 } 3324 3325 static void bdrv_parent_cb_resize(BlockDriverState *bs) 3326 { 3327 BdrvChild *c; 3328 QLIST_FOREACH(c, &bs->parents, next_parent) { 3329 if (c->klass->resize) { 3330 c->klass->resize(c); 3331 } 3332 } 3333 } 3334 3335 /** 3336 * Truncate file to 'offset' bytes (needed only for file protocols) 3337 * 3338 * If 'exact' is true, the file must be resized to exactly the given 3339 * 'offset'. Otherwise, it is sufficient for the node to be at least 3340 * 'offset' bytes in length. 
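 *
 * (Illustrative note, not in the original comment: with exact=false a
 * driver that can only allocate in some coarser granularity may leave the
 * node slightly larger than 'offset'; with exact=true it has to match
 * 'offset' or fail.)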
3341 */ 3342 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, 3343 PreallocMode prealloc, BdrvRequestFlags flags, 3344 Error **errp) 3345 { 3346 BlockDriverState *bs = child->bs; 3347 BlockDriver *drv = bs->drv; 3348 BdrvTrackedRequest req; 3349 int64_t old_size, new_bytes; 3350 int ret; 3351 3352 3353 /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 3354 if (!drv) { 3355 error_setg(errp, "No medium inserted"); 3356 return -ENOMEDIUM; 3357 } 3358 if (offset < 0) { 3359 error_setg(errp, "Image size cannot be negative"); 3360 return -EINVAL; 3361 } 3362 3363 old_size = bdrv_getlength(bs); 3364 if (old_size < 0) { 3365 error_setg_errno(errp, -old_size, "Failed to get old image size"); 3366 return old_size; 3367 } 3368 3369 if (offset > old_size) { 3370 new_bytes = offset - old_size; 3371 } else { 3372 new_bytes = 0; 3373 } 3374 3375 bdrv_inc_in_flight(bs); 3376 tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, 3377 BDRV_TRACKED_TRUNCATE); 3378 3379 /* If we are growing the image and potentially using preallocation for the 3380 * new area, we need to make sure that no write requests are made to it 3381 * concurrently or they might be overwritten by preallocation. */ 3382 if (new_bytes) { 3383 bdrv_mark_request_serialising(&req, 1); 3384 } 3385 if (bs->read_only) { 3386 error_setg(errp, "Image is read-only"); 3387 ret = -EACCES; 3388 goto out; 3389 } 3390 ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, 3391 0); 3392 if (ret < 0) { 3393 error_setg_errno(errp, -ret, 3394 "Failed to prepare request for truncation"); 3395 goto out; 3396 } 3397 3398 /* 3399 * If the image has a backing file that is large enough that it would 3400 * provide data for the new area, we cannot leave it unallocated because 3401 * then the backing file content would become visible. Instead, zero-fill 3402 * the new area. 3403 * 3404 * Note that if the image has a backing file, but was opened without the 3405 * backing file, taking care of keeping things consistent with that backing 3406 * file is the user's responsibility. 3407 */ 3408 if (new_bytes && bs->backing) { 3409 int64_t backing_len; 3410 3411 backing_len = bdrv_getlength(backing_bs(bs)); 3412 if (backing_len < 0) { 3413 ret = backing_len; 3414 error_setg_errno(errp, -ret, "Could not get backing file size"); 3415 goto out; 3416 } 3417 3418 if (backing_len > old_size) { 3419 flags |= BDRV_REQ_ZERO_WRITE; 3420 } 3421 } 3422 3423 if (drv->bdrv_co_truncate) { 3424 if (flags & ~bs->supported_truncate_flags) { 3425 error_setg(errp, "Block driver does not support requested flags"); 3426 ret = -ENOTSUP; 3427 goto out; 3428 } 3429 ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); 3430 } else if (bs->file && drv->is_filter) { 3431 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); 3432 } else { 3433 error_setg(errp, "Image format driver does not support resize"); 3434 ret = -ENOTSUP; 3435 goto out; 3436 } 3437 if (ret < 0) { 3438 goto out; 3439 } 3440 3441 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3442 if (ret < 0) { 3443 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 3444 } else { 3445 offset = bs->total_sectors * BDRV_SECTOR_SIZE; 3446 } 3447 /* It's possible that truncation succeeded but refresh_total_sectors 3448 * failed, but the latter doesn't affect how we should finish the request. 3449 * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. 
*/ 3450 bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); 3451 3452 out: 3453 tracked_request_end(&req); 3454 bdrv_dec_in_flight(bs); 3455 3456 return ret; 3457 } 3458 3459 typedef struct TruncateCo { 3460 BdrvChild *child; 3461 int64_t offset; 3462 bool exact; 3463 PreallocMode prealloc; 3464 BdrvRequestFlags flags; 3465 Error **errp; 3466 int ret; 3467 } TruncateCo; 3468 3469 static void coroutine_fn bdrv_truncate_co_entry(void *opaque) 3470 { 3471 TruncateCo *tco = opaque; 3472 tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact, 3473 tco->prealloc, tco->flags, tco->errp); 3474 aio_wait_kick(); 3475 } 3476 3477 int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, 3478 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) 3479 { 3480 Coroutine *co; 3481 TruncateCo tco = { 3482 .child = child, 3483 .offset = offset, 3484 .exact = exact, 3485 .prealloc = prealloc, 3486 .flags = flags, 3487 .errp = errp, 3488 .ret = NOT_DONE, 3489 }; 3490 3491 if (qemu_in_coroutine()) { 3492 /* Fast-path if already in coroutine context */ 3493 bdrv_truncate_co_entry(&tco); 3494 } else { 3495 co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco); 3496 bdrv_coroutine_enter(child->bs, co); 3497 BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE); 3498 } 3499 3500 return tco.ret; 3501 } 3502
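/*
 * Illustrative sketch, not part of the original file: one plausible way for
 * a caller to grow a node with bdrv_truncate(). The helper name is
 * hypothetical; PREALLOC_MODE_OFF simply requests no preallocation of the
 * newly added area, and exact=false only requires the node to end up at
 * least new_size bytes long.
 */
static int G_GNUC_UNUSED
bdrv_example_grow(BdrvChild *child, int64_t new_size, Error **errp)
{
    int64_t old_size = bdrv_getlength(child->bs);

    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get the current size");
        return old_size;
    }
    if (new_size <= old_size) {
        /* This sketch only handles growing the node */
        return 0;
    }

    return bdrv_truncate(child, new_size, false, PREALLOC_MODE_OFF, 0, errp);
}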