/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                                      bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->klass->drained_end) {
        c->klass->drained_end(c, drained_end_counter);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                                    bool ignore_bds_parents,
                                    int *drained_end_counter)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    c->parent_quiesce_counter++;
    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}
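
/*
 * Illustration (hypothetical values): merging a node whose own max_transfer
 * is 0 (no limit) with a child that reports 64 KiB leaves 64 KiB, because
 * MIN_NON_ZERO() lets the stricter non-zero limit win while 0 keeps meaning
 * "no limit"; the opt_* hints instead take the larger value via MAX().
 */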

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size;

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
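
/*
 * Usage sketch (illustrative): a user such as a block job that wants
 * copy-on-read semantics for its reads takes a reference for the duration
 * of its work and drops it again when done:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... reads issued here populate the top layer ...
 *     bdrv_disable_copy_on_read(bs);
 */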

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
    int *drained_end_counter;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    atomic_mb_set(&data->done, true);
    if (!data->begin) {
        atomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        atomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs),
                                     bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true, NULL);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

/**
 * This function does not poll, nor must any of its recursively called
 * functions. The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles. Therefore, the pointer must remain valid
 * until the pointee reaches 0. That implies that whoever sets up the
 * pointee has to poll until it is 0.
 *
 * We use atomic operations to access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}
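
/*
 * Typical usage (sketch): callers bracket work that must not race with
 * in-flight I/O, e.g. graph manipulation, in a drained section:
 *
 *     bdrv_drained_begin(bs);
 *     ... no new requests start, all in-flight requests have completed ...
 *     bdrv_drained_end(bs);
 */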

void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int drained_end_counter = 0;
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false,
                            &drained_end_counter);
    }

    BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay; waiting for the I/O
     * requests to finish may never terminate.
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    int drained_end_counter = 0;

    /*
     * The bdrv queue is managed by record/replay; waiting for the I/O
     * requests to finish may never terminate.
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
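
/*
 * Example (illustrative numbers): a request tracked with overlap_offset 4096
 * and overlap_bytes 4096 covers [4096, 8192); a new request starting at
 * offset 8192 does not overlap it, while one that ends at 4097 or starts at
 * 8191 does.
 */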

static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
                                      BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);
    return waited;
}

bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    BlockDriverState *bs = req->bs;
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                           - overlap_offset;
    bool waited;

    qemu_co_mutex_lock(&bs->reqs_lock);
    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
    waited = bdrv_wait_serialising_requests_locked(bs, req);
    qemu_co_mutex_unlock(&bs->reqs_lock);
    return waited;
}
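
/*
 * Example (illustrative numbers): serialising a request at offset 70000 with
 * 512 bytes against a 64 KiB cluster size widens it to overlap_offset 65536
 * and overlap_bytes 65536, so every other request touching that cluster is
 * ordered against it.
 */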

/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
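
/*
 * Example (illustrative numbers): with a 64 KiB cluster size, offset 70000
 * and bytes 1000 are rounded to *cluster_offset == 65536 and
 * *cluster_bytes == 65536, i.e. exactly the cluster containing the request.
 */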

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    waited = bdrv_wait_serialising_requests_locked(bs, self);
    qemu_co_mutex_unlock(&bs->reqs_lock);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef int coroutine_fn BdrvRequestEntry(void *opaque);
typedef struct BdrvRunCo {
    BdrvRequestEntry *entry;
    void *opaque;
    int ret;
    bool done;
    Coroutine *co; /* Coroutine, running bdrv_run_co_entry, for debugging */
} BdrvRunCo;

static void coroutine_fn bdrv_run_co_entry(void *opaque)
{
    BdrvRunCo *arg = opaque;

    arg->ret = arg->entry(arg->opaque);
    arg->done = true;
    aio_wait_kick();
}

static int bdrv_run_co(BlockDriverState *bs, BdrvRequestEntry *entry,
                       void *opaque)
{
    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        return entry(opaque);
    } else {
        BdrvRunCo s = { .entry = entry, .opaque = opaque };

        s.co = qemu_coroutine_create(bdrv_run_co_entry, &s);
        bdrv_coroutine_enter(bs, s.co);

        BDRV_POLL_WHILE(bs, !s.done);

        return s.ret;
    }
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    BdrvRequestFlags flags;
} RwCo;

static int coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        return bdrv_co_preadv(rwco->child, rwco->offset,
                              rwco->qiov->size, rwco->qiov,
                              rwco->flags);
    } else {
        return bdrv_co_pwritev(rwco->child, rwco->offset,
                               rwco->qiov->size, rwco->qiov,
                               rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .flags = flags,
    };

    return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);

    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

/* return < 0 if error. See bdrv_pwrite() for the return codes */
int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* Return no. of bytes on success or < 0 on error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid offset or number of bytes
  -EACCES      Trying to write a read-only device
*/
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}
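
/*
 * Usage sketch (illustrative): format drivers typically use these synchronous
 * helpers for small metadata accesses, e.g.
 *
 *     uint8_t header[512];
 *     ret = bdrv_pread(bs->file, 0, header, sizeof(header));
 *     ...
 *     ret = bdrv_pwrite_sync(bs->file, 0, header, sizeof(header));
 *
 * bdrv_pread()/bdrv_pwrite() return the byte count on success,
 * bdrv_pwrite_sync() returns 0; all return a negative errno on failure.
 */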

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov,
                                           size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv_part) {
        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_preadv) {
        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
        goto out;
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
            goto out;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
            goto out;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev_part) {
        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov,
                               size_t qiov_offset)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector local_qiov;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!block_driver_can_compress(drv)) {
        return -ENOTSUP;
    }

    if (drv->bdrv_co_pwritev_compressed_part) {
        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
                                                    qiov, qiov_offset);
    }

    if (qiov_offset == 0) {
        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
    }

    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
        size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer = NULL;

    BlockDriver *drv = bs->drv;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;
    bool skip_write;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /*
     * Do not write anything when the BDS is inactive. That is not
     * allowed, and it would not help.
     */
    skip_write = (bs->open_flags & BDRV_O_INACTIVE);

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating a cluster in the image file. Note that this value may
     * exceed BDRV_REQUEST_MAX_BYTES (even when the original read did not),
     * which is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    while (cluster_bytes) {
        int64_t pnum;

        if (skip_write) {
            ret = 1; /* "already allocated", so nothing will be copied */
            pnum = MIN(cluster_bytes, max_transfer);
        } else {
            ret = bdrv_is_allocated(bs, cluster_offset,
                                    MIN(cluster_bytes, max_transfer), &pnum);
            if (ret < 0) {
                /*
                 * Safe to treat errors in querying allocation as if
                 * unallocated; we'll probably fail again soon on the
                 * read, but at least that will set a decent errno.
                 */
                pnum = MIN(cluster_bytes, max_transfer);
            }

            /* Stop at EOF if the image ends in the middle of the cluster */
            if (ret == 0 && pnum == 0) {
                assert(progress >= bytes);
                break;
            }

            assert(skip_bytes < pnum);
        }

        if (ret <= 0) {
            QEMUIOVector local_qiov;

            /* Must copy-on-read; use the bounce buffer */
            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            if (!bounce_buffer) {
                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);

                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
                if (!bounce_buffer) {
                    ret = -ENOMEM;
                    goto err;
                }
            }
            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests. If this is a deliberate copy-on-read
                 * then we don't want to ignore the error. Simply
                 * report it in all cases.
                 */
                goto err;
            }

            if (!(flags & BDRV_REQ_PREFETCH)) {
                qemu_iovec_from_buf(qiov, qiov_offset + progress,
                                    bounce_buffer + skip_bytes,
                                    MIN(pnum - skip_bytes, bytes - progress));
            }
        } else if (!(flags & BDRV_REQ_PREFETCH)) {
            /* Read directly into the destination */
            ret = bdrv_driver_preadv(bs, offset + progress,
                                     MIN(pnum - skip_bytes, bytes - progress),
                                     qiov, qiov_offset + progress, 0);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers. For now, there aren't any
     * passthrough flags. */
    assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap. This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster. For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
                                           qiov, qiov_offset, flags);
            goto out;
        } else if (flags & BDRV_REQ_PREFETCH) {
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, qiov, bytes - bytes_remaining, 0);
            max_bytes -= num;
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Request padding
 *
 *  |<---- align ----->|                     |<----- align ---->|
 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
 *  |          |       |                     |     |            |
 * -*----------$-------*-------- ... --------*-----$------------*---
 *  |          |       |                     |     |            |
 *  |          offset  |                     |     end          |
 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
 *  [buf ... )                               [tail_buf          )
 *
 * @buf is an aligned allocation needed to store @head and @tail paddings. @head
 * is placed at the beginning of @buf and @tail at the @end.
 *
 * @tail_buf is a pointer to the sub-buffer corresponding to the align-sized
 * chunk around the tail, if a tail exists.
 *
 * @merge_reads is true for small requests,
 * if @buf_len == @head + bytes + @tail. In this case it is possible that both
 * head and tail exist but @buf_len == align and @tail_buf == @buf.
 */
typedef struct BdrvRequestPadding {
    uint8_t *buf;
    size_t buf_len;
    uint8_t *tail_buf;
    size_t head;
    size_t tail;
    bool merge_reads;
    QEMUIOVector local_qiov;
} BdrvRequestPadding;

static bool bdrv_init_padding(BlockDriverState *bs,
                              int64_t offset, int64_t bytes,
                              BdrvRequestPadding *pad)
{
    uint64_t align = bs->bl.request_alignment;
    size_t sum;

    memset(pad, 0, sizeof(*pad));

    pad->head = offset & (align - 1);
    pad->tail = ((offset + bytes) & (align - 1));
    if (pad->tail) {
        pad->tail = align - pad->tail;
    }

    if (!pad->head && !pad->tail) {
        return false;
    }

    assert(bytes); /* Nothing good in aligning zero-length requests */

    sum = pad->head + bytes + pad->tail;
    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
    pad->buf = qemu_blockalign(bs, pad->buf_len);
    pad->merge_reads = sum == pad->buf_len;
    if (pad->tail) {
        pad->tail_buf = pad->buf + pad->buf_len - align;
    }

    return true;
}
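
/*
 * Example (illustrative numbers): with request_alignment 512, a request at
 * offset 100 with 200 bytes gets head == 100 and tail == 212; head + bytes +
 * tail == 512 == align, so one aligned chunk covers everything and
 * merge_reads is true.
 */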

static int bdrv_padding_rmw_read(BdrvChild *child,
                                 BdrvTrackedRequest *req,
                                 BdrvRequestPadding *pad,
                                 bool zero_middle)
{
    QEMUIOVector local_qiov;
    BlockDriverState *bs = child->bs;
    uint64_t align = bs->bl.request_alignment;
    int ret;

    assert(req->serialising && pad->buf);

    if (pad->head || pad->merge_reads) {
        uint64_t bytes = pad->merge_reads ? pad->buf_len : align;

        qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);

        if (pad->head) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        }
        if (pad->merge_reads && pad->tail) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        }
        ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
                                  align, &local_qiov, 0, 0);
        if (ret < 0) {
            return ret;
        }
        if (pad->head) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
        }
        if (pad->merge_reads && pad->tail) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
        }

        if (pad->merge_reads) {
            goto zero_mem;
        }
    }

    if (pad->tail) {
        qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(
                child, req,
                req->overlap_offset + req->overlap_bytes - align,
                align, align, &local_qiov, 0, 0);
        if (ret < 0) {
            return ret;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
    }

zero_mem:
    if (zero_middle) {
        memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
    }

    return 0;
}

static void bdrv_padding_destroy(BdrvRequestPadding *pad)
{
    if (pad->buf) {
        qemu_vfree(pad->buf);
        qemu_iovec_destroy(&pad->local_qiov);
    }
}

/*
 * bdrv_pad_request
 *
 * Exchange request parameters with padded request if needed. Don't include RMW
 * read of padding, bdrv_padding_rmw_read() should be called separately if
 * needed.
 *
 * All parameters except @bs are in-out: they represent the original request on
 * entry and the padded request (if padding is needed) on return.
 *
 * Function always succeeds.
 */
static bool bdrv_pad_request(BlockDriverState *bs,
                             QEMUIOVector **qiov, size_t *qiov_offset,
                             int64_t *offset, unsigned int *bytes,
                             BdrvRequestPadding *pad)
{
    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
        return false;
    }

    qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
                             *qiov, *qiov_offset, *bytes,
                             pad->buf + pad->buf_len - pad->tail, pad->tail);
    *bytes += pad->head + pad->tail;
    *offset -= pad->head;
    *qiov = &pad->local_qiov;
    *qiov_offset = 0;

    return true;
}

int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
}

int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
    int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    BdrvRequestPadding pad;
    int ret;

    trace_bdrv_co_preadv(bs, offset, bytes, flags);

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning a zero-length request is nonsense. Even if the driver
         * assigns special meaning to zero-length requests (like
         * qcow2_co_pwritev_compressed_part), we can't pass the request on to
         * the driver because of request_alignment.
         *
         * Still, there is no reason to return an error if someone does an
         * unaligned zero-length read occasionally.
         */
        return 0;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
                              bs->bl.request_alignment,
                              qiov, qiov_offset, flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    bdrv_padding_destroy(&pad);

    return ret;
}

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    void *buf = NULL;
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
        return -ENOTSUP;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request. Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes. */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector. */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            if (buf == NULL) {
                buf = qemu_try_blockalign0(bs, num);
                if (buf == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }
            qemu_iovec_init_buf(&qiov, buf, num);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(buf);
                buf = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(buf);
    return ret;
}
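
/*
 * Example (illustrative numbers, assuming max_transfer does not clamp the
 * head further): zeroing offset 5000, bytes 200000 with pwrite_zeroes
 * alignment 4096 is split into an unaligned head piece of 3192 bytes, an
 * aligned middle of 196608 bytes (further split if it exceeds
 * max_write_zeroes), and an unaligned 200-byte tail, so the driver sees the
 * bulk of the request aligned.
 */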

static inline int coroutine_fn
bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
                          BdrvTrackedRequest *req, int flags)
{
    BlockDriverState *bs = child->bs;
    bool waited;
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);

    if (bs->read_only) {
        return -EPERM;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));

    if (flags & BDRV_REQ_SERIALISING) {
        waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
        /*
         * For a misaligned request we should have already waited earlier,
         * because we come after bdrv_padding_rmw_read which must be called
         * with the request already marked as serialising.
         */
        assert(!waited ||
               (req->offset == req->overlap_offset &&
                req->bytes == req->overlap_bytes));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    switch (req->type) {
    case BDRV_TRACKED_WRITE:
    case BDRV_TRACKED_DISCARD:
        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
        } else {
            assert(child->perm & BLK_PERM_WRITE);
        }
        return notifier_with_return_list_notify(&bs->before_write_notifiers,
                                                req);
    case BDRV_TRACKED_TRUNCATE:
        assert(child->perm & BLK_PERM_RESIZE);
        return 0;
    default:
        abort();
    }
}

static inline void coroutine_fn
bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
                         BdrvTrackedRequest *req, int ret)
{
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    BlockDriverState *bs = child->bs;

    atomic_inc(&bs->write_gen);

    /*
     * Discard cannot extend the image, but in error handling cases, such as
     * when reverting a qcow2 cluster allocation, the discarded range can pass
     * the end of the image file, so we cannot assert about BDRV_TRACKED_DISCARD
     * here. Instead, just skip it, since semantically a discard request
     * beyond EOF cannot expand the image anyway.
     */
    if (ret == 0 &&
        (req->type == BDRV_TRACKED_TRUNCATE ||
         end_sector > bs->total_sectors) &&
        req->type != BDRV_TRACKED_DISCARD) {
        bs->total_sectors = end_sector;
        bdrv_parent_cb_resize(bs);
        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
    }
    if (req->bytes) {
        switch (req->type) {
        case BDRV_TRACKED_WRITE:
            stat64_max(&bs->wr_highest_offset, offset + bytes);
            /* fall through, to set dirty bits */
        case BDRV_TRACKED_DISCARD:
            bdrv_set_dirty(bs, offset, bytes);
            break;
        default:
            break;
        }
    }
}
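
/*
 * Every write-like request is bracketed by these two helpers:
 * bdrv_co_write_req_prepare() checks permissions and serialisation before the
 * driver is invoked, and bdrv_co_write_req_finish() afterwards updates
 * bs->total_sectors, dirty bitmaps and the write statistics.
 */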
1971 */ 1972 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1973 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1974 int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 1975 { 1976 BlockDriverState *bs = child->bs; 1977 BlockDriver *drv = bs->drv; 1978 int ret; 1979 1980 uint64_t bytes_remaining = bytes; 1981 int max_transfer; 1982 1983 if (!drv) { 1984 return -ENOMEDIUM; 1985 } 1986 1987 if (bdrv_has_readonly_bitmaps(bs)) { 1988 return -EPERM; 1989 } 1990 1991 assert(is_power_of_2(align)); 1992 assert((offset & (align - 1)) == 0); 1993 assert((bytes & (align - 1)) == 0); 1994 assert(!qiov || qiov_offset + bytes <= qiov->size); 1995 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1996 align); 1997 1998 ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 1999 2000 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 2001 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 2002 qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 2003 flags |= BDRV_REQ_ZERO_WRITE; 2004 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 2005 flags |= BDRV_REQ_MAY_UNMAP; 2006 } 2007 } 2008 2009 if (ret < 0) { 2010 /* Do nothing, write notifier decided to fail this request */ 2011 } else if (flags & BDRV_REQ_ZERO_WRITE) { 2012 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 2013 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 2014 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 2015 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 2016 qiov, qiov_offset); 2017 } else if (bytes <= max_transfer) { 2018 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2019 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 2020 } else { 2021 bdrv_debug_event(bs, BLKDBG_PWRITEV); 2022 while (bytes_remaining) { 2023 int num = MIN(bytes_remaining, max_transfer); 2024 int local_flags = flags; 2025 2026 assert(num); 2027 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 2028 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 2029 /* If FUA is going to be emulated by flush, we only 2030 * need to flush on the last iteration */ 2031 local_flags &= ~BDRV_REQ_FUA; 2032 } 2033 2034 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 2035 num, qiov, bytes - bytes_remaining, 2036 local_flags); 2037 if (ret < 0) { 2038 break; 2039 } 2040 bytes_remaining -= num; 2041 } 2042 } 2043 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 2044 2045 if (ret >= 0) { 2046 ret = 0; 2047 } 2048 bdrv_co_write_req_finish(child, offset, bytes, req, ret); 2049 2050 return ret; 2051 } 2052 2053 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 2054 int64_t offset, 2055 unsigned int bytes, 2056 BdrvRequestFlags flags, 2057 BdrvTrackedRequest *req) 2058 { 2059 BlockDriverState *bs = child->bs; 2060 QEMUIOVector local_qiov; 2061 uint64_t align = bs->bl.request_alignment; 2062 int ret = 0; 2063 bool padding; 2064 BdrvRequestPadding pad; 2065 2066 padding = bdrv_init_padding(bs, offset, bytes, &pad); 2067 if (padding) { 2068 bdrv_mark_request_serialising(req, align); 2069 2070 bdrv_padding_rmw_read(child, req, &pad, true); 2071 2072 if (pad.head || pad.merge_reads) { 2073 int64_t aligned_offset = offset & ~(align - 1); 2074 int64_t write_bytes = pad.merge_reads ? 
pad.buf_len : align; 2075 2076 qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 2077 ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 2078 align, &local_qiov, 0, 2079 flags & ~BDRV_REQ_ZERO_WRITE); 2080 if (ret < 0 || pad.merge_reads) { 2081 /* Error or all work is done */ 2082 goto out; 2083 } 2084 offset += write_bytes - pad.head; 2085 bytes -= write_bytes - pad.head; 2086 } 2087 } 2088 2089 assert(!bytes || (offset & (align - 1)) == 0); 2090 if (bytes >= align) { 2091 /* Write the aligned part in the middle. */ 2092 uint64_t aligned_bytes = bytes & ~(align - 1); 2093 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 2094 NULL, 0, flags); 2095 if (ret < 0) { 2096 goto out; 2097 } 2098 bytes -= aligned_bytes; 2099 offset += aligned_bytes; 2100 } 2101 2102 assert(!bytes || (offset & (align - 1)) == 0); 2103 if (bytes) { 2104 assert(align == pad.tail + bytes); 2105 2106 qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align); 2107 ret = bdrv_aligned_pwritev(child, req, offset, align, align, 2108 &local_qiov, 0, 2109 flags & ~BDRV_REQ_ZERO_WRITE); 2110 } 2111 2112 out: 2113 bdrv_padding_destroy(&pad); 2114 2115 return ret; 2116 } 2117 2118 /* 2119 * Handle a write request in coroutine context 2120 */ 2121 int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 2122 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 2123 BdrvRequestFlags flags) 2124 { 2125 return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); 2126 } 2127 2128 int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, 2129 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, 2130 BdrvRequestFlags flags) 2131 { 2132 BlockDriverState *bs = child->bs; 2133 BdrvTrackedRequest req; 2134 uint64_t align = bs->bl.request_alignment; 2135 BdrvRequestPadding pad; 2136 int ret; 2137 2138 trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); 2139 2140 if (!bs->drv) { 2141 return -ENOMEDIUM; 2142 } 2143 2144 ret = bdrv_check_byte_request(bs, offset, bytes); 2145 if (ret < 0) { 2146 return ret; 2147 } 2148 2149 /* If the request is misaligned then we can't make it efficient */ 2150 if ((flags & BDRV_REQ_NO_FALLBACK) && 2151 !QEMU_IS_ALIGNED(offset | bytes, align)) 2152 { 2153 return -ENOTSUP; 2154 } 2155 2156 if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 2157 /* 2158 * Aligning zero request is nonsense. Even if driver has special meaning 2159 * of zero-length (like qcow2_co_pwritev_compressed_part), we can't pass 2160 * it to driver due to request_alignment. 2161 * 2162 * Still, no reason to return an error if someone do unaligned 2163 * zero-length write occasionally. 2164 */ 2165 return 0; 2166 } 2167 2168 bdrv_inc_in_flight(bs); 2169 /* 2170 * Align write if necessary by performing a read-modify-write cycle. 2171 * Pad qiov with the read parts and be sure to have a tracked request not 2172 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
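     * This is why tracked_request_begin() below runs before bdrv_pad_request().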
2173 */ 2174 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 2175 2176 if (flags & BDRV_REQ_ZERO_WRITE) { 2177 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 2178 goto out; 2179 } 2180 2181 if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) { 2182 bdrv_mark_request_serialising(&req, align); 2183 bdrv_padding_rmw_read(child, &req, &pad, false); 2184 } 2185 2186 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 2187 qiov, qiov_offset, flags); 2188 2189 bdrv_padding_destroy(&pad); 2190 2191 out: 2192 tracked_request_end(&req); 2193 bdrv_dec_in_flight(bs); 2194 2195 return ret; 2196 } 2197 2198 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2199 int bytes, BdrvRequestFlags flags) 2200 { 2201 trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 2202 2203 if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 2204 flags &= ~BDRV_REQ_MAY_UNMAP; 2205 } 2206 2207 return bdrv_co_pwritev(child, offset, bytes, NULL, 2208 BDRV_REQ_ZERO_WRITE | flags); 2209 } 2210 2211 /* 2212 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not. 2213 */ 2214 int bdrv_flush_all(void) 2215 { 2216 BdrvNextIterator it; 2217 BlockDriverState *bs = NULL; 2218 int result = 0; 2219 2220 /* 2221 * bdrv queue is managed by record/replay, 2222 * creating new flush request for stopping 2223 * the VM may break the determinism 2224 */ 2225 if (replay_events_enabled()) { 2226 return result; 2227 } 2228 2229 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 2230 AioContext *aio_context = bdrv_get_aio_context(bs); 2231 int ret; 2232 2233 aio_context_acquire(aio_context); 2234 ret = bdrv_flush(bs); 2235 if (ret < 0 && !result) { 2236 result = ret; 2237 } 2238 aio_context_release(aio_context); 2239 } 2240 2241 return result; 2242 } 2243 2244 2245 typedef struct BdrvCoBlockStatusData { 2246 BlockDriverState *bs; 2247 BlockDriverState *base; 2248 bool want_zero; 2249 int64_t offset; 2250 int64_t bytes; 2251 int64_t *pnum; 2252 int64_t *map; 2253 BlockDriverState **file; 2254 } BdrvCoBlockStatusData; 2255 2256 int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, 2257 bool want_zero, 2258 int64_t offset, 2259 int64_t bytes, 2260 int64_t *pnum, 2261 int64_t *map, 2262 BlockDriverState **file) 2263 { 2264 assert(bs->file && bs->file->bs); 2265 *pnum = bytes; 2266 *map = offset; 2267 *file = bs->file->bs; 2268 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2269 } 2270 2271 int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, 2272 bool want_zero, 2273 int64_t offset, 2274 int64_t bytes, 2275 int64_t *pnum, 2276 int64_t *map, 2277 BlockDriverState **file) 2278 { 2279 assert(bs->backing && bs->backing->bs); 2280 *pnum = bytes; 2281 *map = offset; 2282 *file = bs->backing->bs; 2283 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2284 } 2285 2286 /* 2287 * Returns the allocation status of the specified sectors. 2288 * Drivers not implementing the functionality are assumed to not support 2289 * backing files, hence all their sectors are reported as allocated. 2290 * 2291 * If 'want_zero' is true, the caller is querying for mapping 2292 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 2293 * _ZERO where possible; otherwise, the result favors larger 'pnum', 2294 * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2295 * 2296 * If 'offset' is beyond the end of the disk image the return value is 2297 * BDRV_BLOCK_EOF and 'pnum' is set to 0. 
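 * (For example, a query at offset == bdrv_getlength(bs) returns
 * BDRV_BLOCK_EOF with *pnum == 0.)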
2298 * 2299 * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2300 * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2301 * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 2302 * 2303 * 'pnum' is set to the number of bytes (including and immediately 2304 * following the specified offset) that are easily known to be in the 2305 * same allocated/unallocated state. Note that a second call starting 2306 * at the original offset plus returned pnum may have the same status. 2307 * The returned value is non-zero on success except at end-of-file. 2308 * 2309 * Returns negative errno on failure. Otherwise, if the 2310 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 2311 * set to the host mapping and BDS corresponding to the guest offset. 2312 */ 2313 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2314 bool want_zero, 2315 int64_t offset, int64_t bytes, 2316 int64_t *pnum, int64_t *map, 2317 BlockDriverState **file) 2318 { 2319 int64_t total_size; 2320 int64_t n; /* bytes */ 2321 int ret; 2322 int64_t local_map = 0; 2323 BlockDriverState *local_file = NULL; 2324 int64_t aligned_offset, aligned_bytes; 2325 uint32_t align; 2326 2327 assert(pnum); 2328 *pnum = 0; 2329 total_size = bdrv_getlength(bs); 2330 if (total_size < 0) { 2331 ret = total_size; 2332 goto early_out; 2333 } 2334 2335 if (offset >= total_size) { 2336 ret = BDRV_BLOCK_EOF; 2337 goto early_out; 2338 } 2339 if (!bytes) { 2340 ret = 0; 2341 goto early_out; 2342 } 2343 2344 n = total_size - offset; 2345 if (n < bytes) { 2346 bytes = n; 2347 } 2348 2349 /* Must be non-NULL or bdrv_getlength() would have failed */ 2350 assert(bs->drv); 2351 if (!bs->drv->bdrv_co_block_status) { 2352 *pnum = bytes; 2353 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 2354 if (offset + bytes == total_size) { 2355 ret |= BDRV_BLOCK_EOF; 2356 } 2357 if (bs->drv->protocol_name) { 2358 ret |= BDRV_BLOCK_OFFSET_VALID; 2359 local_map = offset; 2360 local_file = bs; 2361 } 2362 goto early_out; 2363 } 2364 2365 bdrv_inc_in_flight(bs); 2366 2367 /* Round out to request_alignment boundaries */ 2368 align = bs->bl.request_alignment; 2369 aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2370 aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2371 2372 ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 2373 aligned_bytes, pnum, &local_map, 2374 &local_file); 2375 if (ret < 0) { 2376 *pnum = 0; 2377 goto out; 2378 } 2379 2380 /* 2381 * The driver's result must be a non-zero multiple of request_alignment. 2382 * Clamp pnum and adjust map to original request. 
2383 */ 2384 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2385 align > offset - aligned_offset); 2386 if (ret & BDRV_BLOCK_RECURSE) { 2387 assert(ret & BDRV_BLOCK_DATA); 2388 assert(ret & BDRV_BLOCK_OFFSET_VALID); 2389 assert(!(ret & BDRV_BLOCK_ZERO)); 2390 } 2391 2392 *pnum -= offset - aligned_offset; 2393 if (*pnum > bytes) { 2394 *pnum = bytes; 2395 } 2396 if (ret & BDRV_BLOCK_OFFSET_VALID) { 2397 local_map += offset - aligned_offset; 2398 } 2399 2400 if (ret & BDRV_BLOCK_RAW) { 2401 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2402 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2403 *pnum, pnum, &local_map, &local_file); 2404 goto out; 2405 } 2406 2407 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2408 ret |= BDRV_BLOCK_ALLOCATED; 2409 } else if (want_zero && bs->drv->supports_backing) { 2410 if (bs->backing) { 2411 BlockDriverState *bs2 = bs->backing->bs; 2412 int64_t size2 = bdrv_getlength(bs2); 2413 2414 if (size2 >= 0 && offset >= size2) { 2415 ret |= BDRV_BLOCK_ZERO; 2416 } 2417 } else { 2418 ret |= BDRV_BLOCK_ZERO; 2419 } 2420 } 2421 2422 if (want_zero && ret & BDRV_BLOCK_RECURSE && 2423 local_file && local_file != bs && 2424 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2425 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2426 int64_t file_pnum; 2427 int ret2; 2428 2429 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2430 *pnum, &file_pnum, NULL, NULL); 2431 if (ret2 >= 0) { 2432 /* Ignore errors. This is just providing extra information, it 2433 * is useful but not necessary. 2434 */ 2435 if (ret2 & BDRV_BLOCK_EOF && 2436 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2437 /* 2438 * It is valid for the format block driver to read 2439 * beyond the end of the underlying file's current 2440 * size; such areas read as zero. 2441 */ 2442 ret |= BDRV_BLOCK_ZERO; 2443 } else { 2444 /* Limit request to the range reported by the protocol driver */ 2445 *pnum = file_pnum; 2446 ret |= (ret2 & BDRV_BLOCK_ZERO); 2447 } 2448 } 2449 } 2450 2451 out: 2452 bdrv_dec_in_flight(bs); 2453 if (ret >= 0 && offset + *pnum == total_size) { 2454 ret |= BDRV_BLOCK_EOF; 2455 } 2456 early_out: 2457 if (file) { 2458 *file = local_file; 2459 } 2460 if (map) { 2461 *map = local_map; 2462 } 2463 return ret; 2464 } 2465 2466 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2467 BlockDriverState *base, 2468 bool want_zero, 2469 int64_t offset, 2470 int64_t bytes, 2471 int64_t *pnum, 2472 int64_t *map, 2473 BlockDriverState **file) 2474 { 2475 BlockDriverState *p; 2476 int ret = 0; 2477 bool first = true; 2478 2479 assert(bs != base); 2480 for (p = bs; p != base; p = backing_bs(p)) { 2481 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2482 file); 2483 if (ret < 0) { 2484 break; 2485 } 2486 if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2487 /* 2488 * Reading beyond the end of the file continues to read 2489 * zeroes, but we can only widen the result to the 2490 * unallocated length we learned from an earlier 2491 * iteration. 2492 */ 2493 *pnum = bytes; 2494 } 2495 if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2496 break; 2497 } 2498 /* [offset, pnum] unallocated on this layer, which could be only 2499 * the first part of [offset, bytes]. 
*/ 2500 bytes = MIN(bytes, *pnum); 2501 first = false; 2502 } 2503 return ret; 2504 } 2505 2506 /* Coroutine wrapper for bdrv_block_status_above() */ 2507 static int coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 2508 { 2509 BdrvCoBlockStatusData *data = opaque; 2510 2511 return bdrv_co_block_status_above(data->bs, data->base, 2512 data->want_zero, 2513 data->offset, data->bytes, 2514 data->pnum, data->map, data->file); 2515 } 2516 2517 /* 2518 * Synchronous wrapper around bdrv_co_block_status_above(). 2519 * 2520 * See bdrv_co_block_status_above() for details. 2521 */ 2522 static int bdrv_common_block_status_above(BlockDriverState *bs, 2523 BlockDriverState *base, 2524 bool want_zero, int64_t offset, 2525 int64_t bytes, int64_t *pnum, 2526 int64_t *map, 2527 BlockDriverState **file) 2528 { 2529 BdrvCoBlockStatusData data = { 2530 .bs = bs, 2531 .base = base, 2532 .want_zero = want_zero, 2533 .offset = offset, 2534 .bytes = bytes, 2535 .pnum = pnum, 2536 .map = map, 2537 .file = file, 2538 }; 2539 2540 return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data); 2541 } 2542 2543 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 2544 int64_t offset, int64_t bytes, int64_t *pnum, 2545 int64_t *map, BlockDriverState **file) 2546 { 2547 return bdrv_common_block_status_above(bs, base, true, offset, bytes, 2548 pnum, map, file); 2549 } 2550 2551 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2552 int64_t *pnum, int64_t *map, BlockDriverState **file) 2553 { 2554 return bdrv_block_status_above(bs, backing_bs(bs), 2555 offset, bytes, pnum, map, file); 2556 } 2557 2558 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2559 int64_t bytes, int64_t *pnum) 2560 { 2561 int ret; 2562 int64_t dummy; 2563 2564 ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, 2565 bytes, pnum ? pnum : &dummy, NULL, 2566 NULL); 2567 if (ret < 0) { 2568 return ret; 2569 } 2570 return !!(ret & BDRV_BLOCK_ALLOCATED); 2571 } 2572 2573 /* 2574 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 2575 * 2576 * Return 1 if (a prefix of) the given range is allocated in any image 2577 * between BASE and TOP (BASE is only included if include_base is set). 2578 * BASE can be NULL to check if the given offset is allocated in any 2579 * image of the chain. Return 0 otherwise, or negative errno on 2580 * failure. 2581 * 2582 * 'pnum' is set to the number of bytes (including and immediately 2583 * following the specified offset) that are known to be in the same 2584 * allocated/unallocated state. Note that a subsequent call starting 2585 * at 'offset + *pnum' may return the same allocation status (in other 2586 * words, the result is not necessarily the maximum possible range); 2587 * but 'pnum' will only be 0 when end of file is reached. 
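 *
 * Illustrative example: given a chain base <- mid <- top where only 'mid'
 * allocates the queried range, calling this on 'top' with include_base set
 * to false returns 1 with *pnum taken from 'mid', even though 'top' itself
 * leaves the range unallocated.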
2588 * 2589 */ 2590 int bdrv_is_allocated_above(BlockDriverState *top, 2591 BlockDriverState *base, 2592 bool include_base, int64_t offset, 2593 int64_t bytes, int64_t *pnum) 2594 { 2595 BlockDriverState *intermediate; 2596 int ret; 2597 int64_t n = bytes; 2598 2599 assert(base || !include_base); 2600 2601 intermediate = top; 2602 while (include_base || intermediate != base) { 2603 int64_t pnum_inter; 2604 int64_t size_inter; 2605 2606 assert(intermediate); 2607 ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 2608 if (ret < 0) { 2609 return ret; 2610 } 2611 if (ret) { 2612 *pnum = pnum_inter; 2613 return 1; 2614 } 2615 2616 size_inter = bdrv_getlength(intermediate); 2617 if (size_inter < 0) { 2618 return size_inter; 2619 } 2620 if (n > pnum_inter && 2621 (intermediate == top || offset + pnum_inter < size_inter)) { 2622 n = pnum_inter; 2623 } 2624 2625 if (intermediate == base) { 2626 break; 2627 } 2628 2629 intermediate = backing_bs(intermediate); 2630 } 2631 2632 *pnum = n; 2633 return 0; 2634 } 2635 2636 typedef struct BdrvVmstateCo { 2637 BlockDriverState *bs; 2638 QEMUIOVector *qiov; 2639 int64_t pos; 2640 bool is_read; 2641 } BdrvVmstateCo; 2642 2643 static int coroutine_fn 2644 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2645 bool is_read) 2646 { 2647 BlockDriver *drv = bs->drv; 2648 int ret = -ENOTSUP; 2649 2650 bdrv_inc_in_flight(bs); 2651 2652 if (!drv) { 2653 ret = -ENOMEDIUM; 2654 } else if (drv->bdrv_load_vmstate) { 2655 if (is_read) { 2656 ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2657 } else { 2658 ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2659 } 2660 } else if (bs->file) { 2661 ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); 2662 } 2663 2664 bdrv_dec_in_flight(bs); 2665 return ret; 2666 } 2667 2668 static int coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) 2669 { 2670 BdrvVmstateCo *co = opaque; 2671 2672 return bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); 2673 } 2674 2675 static inline int 2676 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2677 bool is_read) 2678 { 2679 BdrvVmstateCo data = { 2680 .bs = bs, 2681 .qiov = qiov, 2682 .pos = pos, 2683 .is_read = is_read, 2684 }; 2685 2686 return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data); 2687 } 2688 2689 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 2690 int64_t pos, int size) 2691 { 2692 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2693 int ret; 2694 2695 ret = bdrv_writev_vmstate(bs, &qiov, pos); 2696 if (ret < 0) { 2697 return ret; 2698 } 2699 2700 return size; 2701 } 2702 2703 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2704 { 2705 return bdrv_rw_vmstate(bs, qiov, pos, false); 2706 } 2707 2708 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 2709 int64_t pos, int size) 2710 { 2711 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2712 int ret; 2713 2714 ret = bdrv_readv_vmstate(bs, &qiov, pos); 2715 if (ret < 0) { 2716 return ret; 2717 } 2718 2719 return size; 2720 } 2721 2722 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2723 { 2724 return bdrv_rw_vmstate(bs, qiov, pos, true); 2725 } 2726 2727 /**************************************************************/ 2728 /* async I/Os */ 2729 2730 void bdrv_aio_cancel(BlockAIOCB *acb) 2731 { 2732 qemu_aio_ref(acb); 2733 bdrv_aio_cancel_async(acb); 2734 while (acb->refcnt > 1) { 2735 if (acb->aiocb_info->get_aio_context) { 2736 
aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2737 } else if (acb->bs) { 2738 /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 2739 * assert that we're not using an I/O thread. Thread-safe 2740 * code should use bdrv_aio_cancel_async exclusively. 2741 */ 2742 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 2743 aio_poll(bdrv_get_aio_context(acb->bs), true); 2744 } else { 2745 abort(); 2746 } 2747 } 2748 qemu_aio_unref(acb); 2749 } 2750 2751 /* Async version of aio cancel. The caller is not blocked if the acb implements 2752 * cancel_async, otherwise we do nothing and let the request normally complete. 2753 * In either case the completion callback must be called. */ 2754 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2755 { 2756 if (acb->aiocb_info->cancel_async) { 2757 acb->aiocb_info->cancel_async(acb); 2758 } 2759 } 2760 2761 /**************************************************************/ 2762 /* Coroutine block device emulation */ 2763 2764 static int coroutine_fn bdrv_flush_co_entry(void *opaque) 2765 { 2766 return bdrv_co_flush(opaque); 2767 } 2768 2769 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2770 { 2771 int current_gen; 2772 int ret = 0; 2773 2774 bdrv_inc_in_flight(bs); 2775 2776 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2777 bdrv_is_sg(bs)) { 2778 goto early_exit; 2779 } 2780 2781 qemu_co_mutex_lock(&bs->reqs_lock); 2782 current_gen = atomic_read(&bs->write_gen); 2783 2784 /* Wait until any previous flushes are completed */ 2785 while (bs->active_flush_req) { 2786 qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 2787 } 2788 2789 /* Flushes reach this point in nondecreasing current_gen order. */ 2790 bs->active_flush_req = true; 2791 qemu_co_mutex_unlock(&bs->reqs_lock); 2792 2793 /* Write back all layers by calling one driver function */ 2794 if (bs->drv->bdrv_co_flush) { 2795 ret = bs->drv->bdrv_co_flush(bs); 2796 goto out; 2797 } 2798 2799 /* Write back cached data to the OS even with cache=unsafe */ 2800 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2801 if (bs->drv->bdrv_co_flush_to_os) { 2802 ret = bs->drv->bdrv_co_flush_to_os(bs); 2803 if (ret < 0) { 2804 goto out; 2805 } 2806 } 2807 2808 /* But don't actually force it to the disk with cache=unsafe */ 2809 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2810 goto flush_parent; 2811 } 2812 2813 /* Check if we really need to flush anything */ 2814 if (bs->flushed_gen == current_gen) { 2815 goto flush_parent; 2816 } 2817 2818 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2819 if (!bs->drv) { 2820 /* bs->drv->bdrv_co_flush() might have ejected the BDS 2821 * (even in case of apparent success) */ 2822 ret = -ENOMEDIUM; 2823 goto out; 2824 } 2825 if (bs->drv->bdrv_co_flush_to_disk) { 2826 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2827 } else if (bs->drv->bdrv_aio_flush) { 2828 BlockAIOCB *acb; 2829 CoroutineIOCompletion co = { 2830 .coroutine = qemu_coroutine_self(), 2831 }; 2832 2833 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2834 if (acb == NULL) { 2835 ret = -EIO; 2836 } else { 2837 qemu_coroutine_yield(); 2838 ret = co.ret; 2839 } 2840 } else { 2841 /* 2842 * Some block drivers always operate in either writethrough or unsafe 2843 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2844 * know how the server works (because the behaviour is hardcoded or 2845 * depends on server-side configuration), so we can't ensure that 2846 * everything is safe on disk. 
Returning an error doesn't work because 2847 * that would break guests even if the server operates in writethrough 2848 * mode. 2849 * 2850 * Let's hope the user knows what he's doing. 2851 */ 2852 ret = 0; 2853 } 2854 2855 if (ret < 0) { 2856 goto out; 2857 } 2858 2859 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2860 * in the case of cache=unsafe, so there are no useless flushes. 2861 */ 2862 flush_parent: 2863 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; 2864 out: 2865 /* Notify any pending flushes that we have completed */ 2866 if (ret == 0) { 2867 bs->flushed_gen = current_gen; 2868 } 2869 2870 qemu_co_mutex_lock(&bs->reqs_lock); 2871 bs->active_flush_req = false; 2872 /* Return value is ignored - it's ok if wait queue is empty */ 2873 qemu_co_queue_next(&bs->flush_queue); 2874 qemu_co_mutex_unlock(&bs->reqs_lock); 2875 2876 early_exit: 2877 bdrv_dec_in_flight(bs); 2878 return ret; 2879 } 2880 2881 int bdrv_flush(BlockDriverState *bs) 2882 { 2883 return bdrv_run_co(bs, bdrv_flush_co_entry, bs); 2884 } 2885 2886 typedef struct DiscardCo { 2887 BdrvChild *child; 2888 int64_t offset; 2889 int64_t bytes; 2890 } DiscardCo; 2891 2892 static int coroutine_fn bdrv_pdiscard_co_entry(void *opaque) 2893 { 2894 DiscardCo *rwco = opaque; 2895 2896 return bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); 2897 } 2898 2899 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2900 int64_t bytes) 2901 { 2902 BdrvTrackedRequest req; 2903 int max_pdiscard, ret; 2904 int head, tail, align; 2905 BlockDriverState *bs = child->bs; 2906 2907 if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 2908 return -ENOMEDIUM; 2909 } 2910 2911 if (bdrv_has_readonly_bitmaps(bs)) { 2912 return -EPERM; 2913 } 2914 2915 if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) { 2916 return -EIO; 2917 } 2918 2919 /* Do nothing if disabled. */ 2920 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2921 return 0; 2922 } 2923 2924 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 2925 return 0; 2926 } 2927 2928 /* Discard is advisory, but some devices track and coalesce 2929 * unaligned requests, so we must pass everything down rather than 2930 * round here. Still, most devices will just silently ignore 2931 * unaligned requests (by returning -ENOTSUP), so we must fragment 2932 * the request accordingly. */ 2933 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2934 assert(align % bs->bl.request_alignment == 0); 2935 head = offset % align; 2936 tail = (offset + bytes) % align; 2937 2938 bdrv_inc_in_flight(bs); 2939 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 2940 2941 ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2942 if (ret < 0) { 2943 goto out; 2944 } 2945 2946 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 2947 align); 2948 assert(max_pdiscard >= bs->bl.request_alignment); 2949 2950 while (bytes > 0) { 2951 int64_t num = bytes; 2952 2953 if (head) { 2954 /* Make small requests to get to alignment boundaries. */ 2955 num = MIN(bytes, align - head); 2956 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 2957 num %= bs->bl.request_alignment; 2958 } 2959 head = (head + num) % align; 2960 assert(num < max_pdiscard); 2961 } else if (tail) { 2962 if (num > align) { 2963 /* Shorten the request to the last aligned cluster. 
*/ 2964 num -= tail; 2965 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 2966 tail > bs->bl.request_alignment) { 2967 tail %= bs->bl.request_alignment; 2968 num -= tail; 2969 } 2970 } 2971 /* limit request size */ 2972 if (num > max_pdiscard) { 2973 num = max_pdiscard; 2974 } 2975 2976 if (!bs->drv) { 2977 ret = -ENOMEDIUM; 2978 goto out; 2979 } 2980 if (bs->drv->bdrv_co_pdiscard) { 2981 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 2982 } else { 2983 BlockAIOCB *acb; 2984 CoroutineIOCompletion co = { 2985 .coroutine = qemu_coroutine_self(), 2986 }; 2987 2988 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 2989 bdrv_co_io_em_complete, &co); 2990 if (acb == NULL) { 2991 ret = -EIO; 2992 goto out; 2993 } else { 2994 qemu_coroutine_yield(); 2995 ret = co.ret; 2996 } 2997 } 2998 if (ret && ret != -ENOTSUP) { 2999 goto out; 3000 } 3001 3002 offset += num; 3003 bytes -= num; 3004 } 3005 ret = 0; 3006 out: 3007 bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3008 tracked_request_end(&req); 3009 bdrv_dec_in_flight(bs); 3010 return ret; 3011 } 3012 3013 int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes) 3014 { 3015 DiscardCo rwco = { 3016 .child = child, 3017 .offset = offset, 3018 .bytes = bytes, 3019 }; 3020 3021 return bdrv_run_co(child->bs, bdrv_pdiscard_co_entry, &rwco); 3022 } 3023 3024 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 3025 { 3026 BlockDriver *drv = bs->drv; 3027 CoroutineIOCompletion co = { 3028 .coroutine = qemu_coroutine_self(), 3029 }; 3030 BlockAIOCB *acb; 3031 3032 bdrv_inc_in_flight(bs); 3033 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 3034 co.ret = -ENOTSUP; 3035 goto out; 3036 } 3037 3038 if (drv->bdrv_co_ioctl) { 3039 co.ret = drv->bdrv_co_ioctl(bs, req, buf); 3040 } else { 3041 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 3042 if (!acb) { 3043 co.ret = -ENOTSUP; 3044 goto out; 3045 } 3046 qemu_coroutine_yield(); 3047 } 3048 out: 3049 bdrv_dec_in_flight(bs); 3050 return co.ret; 3051 } 3052 3053 void *qemu_blockalign(BlockDriverState *bs, size_t size) 3054 { 3055 return qemu_memalign(bdrv_opt_mem_align(bs), size); 3056 } 3057 3058 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 3059 { 3060 return memset(qemu_blockalign(bs, size), 0, size); 3061 } 3062 3063 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 3064 { 3065 size_t align = bdrv_opt_mem_align(bs); 3066 3067 /* Ensure that NULL is never returned on success */ 3068 assert(align > 0); 3069 if (size == 0) { 3070 size = align; 3071 } 3072 3073 return qemu_try_memalign(align, size); 3074 } 3075 3076 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 3077 { 3078 void *mem = qemu_try_blockalign(bs, size); 3079 3080 if (mem) { 3081 memset(mem, 0, size); 3082 } 3083 3084 return mem; 3085 } 3086 3087 /* 3088 * Check if all memory in this vector is sector aligned. 
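 *
 * The alignment used is bdrv_min_mem_align(bs), which need not be the
 * literal 512-byte sector size.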
3089 */ 3090 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 3091 { 3092 int i; 3093 size_t alignment = bdrv_min_mem_align(bs); 3094 3095 for (i = 0; i < qiov->niov; i++) { 3096 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 3097 return false; 3098 } 3099 if (qiov->iov[i].iov_len % alignment) { 3100 return false; 3101 } 3102 } 3103 3104 return true; 3105 } 3106 3107 void bdrv_add_before_write_notifier(BlockDriverState *bs, 3108 NotifierWithReturn *notifier) 3109 { 3110 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 3111 } 3112 3113 void bdrv_io_plug(BlockDriverState *bs) 3114 { 3115 BdrvChild *child; 3116 3117 QLIST_FOREACH(child, &bs->children, next) { 3118 bdrv_io_plug(child->bs); 3119 } 3120 3121 if (atomic_fetch_inc(&bs->io_plugged) == 0) { 3122 BlockDriver *drv = bs->drv; 3123 if (drv && drv->bdrv_io_plug) { 3124 drv->bdrv_io_plug(bs); 3125 } 3126 } 3127 } 3128 3129 void bdrv_io_unplug(BlockDriverState *bs) 3130 { 3131 BdrvChild *child; 3132 3133 assert(bs->io_plugged); 3134 if (atomic_fetch_dec(&bs->io_plugged) == 1) { 3135 BlockDriver *drv = bs->drv; 3136 if (drv && drv->bdrv_io_unplug) { 3137 drv->bdrv_io_unplug(bs); 3138 } 3139 } 3140 3141 QLIST_FOREACH(child, &bs->children, next) { 3142 bdrv_io_unplug(child->bs); 3143 } 3144 } 3145 3146 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 3147 { 3148 BdrvChild *child; 3149 3150 if (bs->drv && bs->drv->bdrv_register_buf) { 3151 bs->drv->bdrv_register_buf(bs, host, size); 3152 } 3153 QLIST_FOREACH(child, &bs->children, next) { 3154 bdrv_register_buf(child->bs, host, size); 3155 } 3156 } 3157 3158 void bdrv_unregister_buf(BlockDriverState *bs, void *host) 3159 { 3160 BdrvChild *child; 3161 3162 if (bs->drv && bs->drv->bdrv_unregister_buf) { 3163 bs->drv->bdrv_unregister_buf(bs, host); 3164 } 3165 QLIST_FOREACH(child, &bs->children, next) { 3166 bdrv_unregister_buf(child->bs, host); 3167 } 3168 } 3169 3170 static int coroutine_fn bdrv_co_copy_range_internal( 3171 BdrvChild *src, uint64_t src_offset, BdrvChild *dst, 3172 uint64_t dst_offset, uint64_t bytes, 3173 BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3174 bool recurse_src) 3175 { 3176 BdrvTrackedRequest req; 3177 int ret; 3178 3179 /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3180 assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3181 assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 3182 3183 if (!dst || !dst->bs) { 3184 return -ENOMEDIUM; 3185 } 3186 ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); 3187 if (ret) { 3188 return ret; 3189 } 3190 if (write_flags & BDRV_REQ_ZERO_WRITE) { 3191 return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3192 } 3193 3194 if (!src || !src->bs) { 3195 return -ENOMEDIUM; 3196 } 3197 ret = bdrv_check_byte_request(src->bs, src_offset, bytes); 3198 if (ret) { 3199 return ret; 3200 } 3201 3202 if (!src->bs->drv->bdrv_co_copy_range_from 3203 || !dst->bs->drv->bdrv_co_copy_range_to 3204 || src->bs->encrypted || dst->bs->encrypted) { 3205 return -ENOTSUP; 3206 } 3207 3208 if (recurse_src) { 3209 bdrv_inc_in_flight(src->bs); 3210 tracked_request_begin(&req, src->bs, src_offset, bytes, 3211 BDRV_TRACKED_READ); 3212 3213 /* BDRV_REQ_SERIALISING is only for write operation */ 3214 assert(!(read_flags & BDRV_REQ_SERIALISING)); 3215 bdrv_wait_serialising_requests(&req); 3216 3217 ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3218 src, src_offset, 3219 dst, dst_offset, 3220 bytes, 3221 read_flags, write_flags); 3222 3223 tracked_request_end(&req); 
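        /* Matches the bdrv_inc_in_flight(src->bs) taken for the read side above */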
3224 bdrv_dec_in_flight(src->bs); 3225 } else { 3226 bdrv_inc_in_flight(dst->bs); 3227 tracked_request_begin(&req, dst->bs, dst_offset, bytes, 3228 BDRV_TRACKED_WRITE); 3229 ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 3230 write_flags); 3231 if (!ret) { 3232 ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3233 src, src_offset, 3234 dst, dst_offset, 3235 bytes, 3236 read_flags, write_flags); 3237 } 3238 bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3239 tracked_request_end(&req); 3240 bdrv_dec_in_flight(dst->bs); 3241 } 3242 3243 return ret; 3244 } 3245 3246 /* Copy range from @src to @dst. 3247 * 3248 * See the comment of bdrv_co_copy_range for the parameter and return value 3249 * semantics. */ 3250 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, 3251 BdrvChild *dst, uint64_t dst_offset, 3252 uint64_t bytes, 3253 BdrvRequestFlags read_flags, 3254 BdrvRequestFlags write_flags) 3255 { 3256 trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3257 read_flags, write_flags); 3258 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3259 bytes, read_flags, write_flags, true); 3260 } 3261 3262 /* Copy range from @src to @dst. 3263 * 3264 * See the comment of bdrv_co_copy_range for the parameter and return value 3265 * semantics. */ 3266 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, 3267 BdrvChild *dst, uint64_t dst_offset, 3268 uint64_t bytes, 3269 BdrvRequestFlags read_flags, 3270 BdrvRequestFlags write_flags) 3271 { 3272 trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3273 read_flags, write_flags); 3274 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 3275 bytes, read_flags, write_flags, false); 3276 } 3277 3278 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, 3279 BdrvChild *dst, uint64_t dst_offset, 3280 uint64_t bytes, BdrvRequestFlags read_flags, 3281 BdrvRequestFlags write_flags) 3282 { 3283 return bdrv_co_copy_range_from(src, src_offset, 3284 dst, dst_offset, 3285 bytes, read_flags, write_flags); 3286 } 3287 3288 static void bdrv_parent_cb_resize(BlockDriverState *bs) 3289 { 3290 BdrvChild *c; 3291 QLIST_FOREACH(c, &bs->parents, next_parent) { 3292 if (c->klass->resize) { 3293 c->klass->resize(c); 3294 } 3295 } 3296 } 3297 3298 /** 3299 * Truncate file to 'offset' bytes (needed only for file protocols) 3300 * 3301 * If 'exact' is true, the file must be resized to exactly the given 3302 * 'offset'. Otherwise, it is sufficient for the node to be at least 3303 * 'offset' bytes in length. 
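 *
 * Callers outside coroutine context can use the synchronous wrapper
 * bdrv_truncate() defined at the end of this file.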
3304 */ 3305 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, 3306 PreallocMode prealloc, BdrvRequestFlags flags, 3307 Error **errp) 3308 { 3309 BlockDriverState *bs = child->bs; 3310 BlockDriver *drv = bs->drv; 3311 BdrvTrackedRequest req; 3312 int64_t old_size, new_bytes; 3313 int ret; 3314 3315 3316 /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 3317 if (!drv) { 3318 error_setg(errp, "No medium inserted"); 3319 return -ENOMEDIUM; 3320 } 3321 if (offset < 0) { 3322 error_setg(errp, "Image size cannot be negative"); 3323 return -EINVAL; 3324 } 3325 3326 old_size = bdrv_getlength(bs); 3327 if (old_size < 0) { 3328 error_setg_errno(errp, -old_size, "Failed to get old image size"); 3329 return old_size; 3330 } 3331 3332 if (offset > old_size) { 3333 new_bytes = offset - old_size; 3334 } else { 3335 new_bytes = 0; 3336 } 3337 3338 bdrv_inc_in_flight(bs); 3339 tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, 3340 BDRV_TRACKED_TRUNCATE); 3341 3342 /* If we are growing the image and potentially using preallocation for the 3343 * new area, we need to make sure that no write requests are made to it 3344 * concurrently or they might be overwritten by preallocation. */ 3345 if (new_bytes) { 3346 bdrv_mark_request_serialising(&req, 1); 3347 } 3348 if (bs->read_only) { 3349 error_setg(errp, "Image is read-only"); 3350 ret = -EACCES; 3351 goto out; 3352 } 3353 ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, 3354 0); 3355 if (ret < 0) { 3356 error_setg_errno(errp, -ret, 3357 "Failed to prepare request for truncation"); 3358 goto out; 3359 } 3360 3361 /* 3362 * If the image has a backing file that is large enough that it would 3363 * provide data for the new area, we cannot leave it unallocated because 3364 * then the backing file content would become visible. Instead, zero-fill 3365 * the new area. 3366 * 3367 * Note that if the image has a backing file, but was opened without the 3368 * backing file, taking care of keeping things consistent with that backing 3369 * file is the user's responsibility. 3370 */ 3371 if (new_bytes && bs->backing) { 3372 int64_t backing_len; 3373 3374 backing_len = bdrv_getlength(backing_bs(bs)); 3375 if (backing_len < 0) { 3376 ret = backing_len; 3377 error_setg_errno(errp, -ret, "Could not get backing file size"); 3378 goto out; 3379 } 3380 3381 if (backing_len > old_size) { 3382 flags |= BDRV_REQ_ZERO_WRITE; 3383 } 3384 } 3385 3386 if (drv->bdrv_co_truncate) { 3387 if (flags & ~bs->supported_truncate_flags) { 3388 error_setg(errp, "Block driver does not support requested flags"); 3389 ret = -ENOTSUP; 3390 goto out; 3391 } 3392 ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); 3393 } else if (bs->file && drv->is_filter) { 3394 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); 3395 } else { 3396 error_setg(errp, "Image format driver does not support resize"); 3397 ret = -ENOTSUP; 3398 goto out; 3399 } 3400 if (ret < 0) { 3401 goto out; 3402 } 3403 3404 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3405 if (ret < 0) { 3406 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 3407 } else { 3408 offset = bs->total_sectors * BDRV_SECTOR_SIZE; 3409 } 3410 /* It's possible that truncation succeeded but refresh_total_sectors 3411 * failed, but the latter doesn't affect how we should finish the request. 3412 * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. 
*/ 3413 bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); 3414 3415 out: 3416 tracked_request_end(&req); 3417 bdrv_dec_in_flight(bs); 3418 3419 return ret; 3420 } 3421 3422 typedef struct TruncateCo { 3423 BdrvChild *child; 3424 int64_t offset; 3425 bool exact; 3426 PreallocMode prealloc; 3427 BdrvRequestFlags flags; 3428 Error **errp; 3429 } TruncateCo; 3430 3431 static int coroutine_fn bdrv_truncate_co_entry(void *opaque) 3432 { 3433 TruncateCo *tco = opaque; 3434 3435 return bdrv_co_truncate(tco->child, tco->offset, tco->exact, 3436 tco->prealloc, tco->flags, tco->errp); 3437 } 3438 3439 int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, 3440 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) 3441 { 3442 TruncateCo tco = { 3443 .child = child, 3444 .offset = offset, 3445 .exact = exact, 3446 .prealloc = prealloc, 3447 .flags = flags, 3448 .errp = errp, 3449 }; 3450 3451 return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco); 3452 } 3453
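
/*
 * Illustrative usage sketch (not compiled): a hypothetical caller that walks
 * the allocation map of a node could loop over bdrv_block_status() roughly as
 * below, advancing by *pnum until the end of the image.  The helper name is
 * invented for the example and error handling is reduced to bailing out; on
 * each iteration ret holds the BDRV_BLOCK_* flags for [offset, offset + pnum).
 *
 *     static void example_walk_extents(BlockDriverState *bs)
 *     {
 *         int64_t size = bdrv_getlength(bs);
 *         int64_t offset = 0;
 *
 *         while (size >= 0 && offset < size) {
 *             int64_t pnum, map;
 *             BlockDriverState *file;
 *             int ret = bdrv_block_status(bs, offset, size - offset,
 *                                         &pnum, &map, &file);
 *
 *             if (ret < 0 || pnum == 0) {
 *                 break;
 *             }
 *             offset += pnum;
 *         }
 *     }
 */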