/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static AioWait drain_all_aio_wait;

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                               bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                             bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        if (c->role->drained_poll) {
            busy |= c->role->drained_poll(c);
        }
    }

    return busy;
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
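
/*
 * Illustrative sketch (not part of the original file): each user takes and
 * later drops its own reference, so concurrent users cannot clobber each
 * other's state:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... submit reads; data served from a backing file also populates bs ...
 *     bdrv_disable_copy_on_read(bs);
 */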

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_dec_in_flight(bs);

    if (data->begin) {
        g_free(data);
    }
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    /* Execute pending BHs first and check everything else only after the BHs
     * have executed. */
    while (aio_poll(bs->aio_context, false));

    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
    };
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL, false);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL, false);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false);
    }
}
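
/*
 * Illustrative sketch (not part of the original file): callers that need a
 * node to be quiescent while they work on it bracket the critical section
 * with the begin/end pair, e.g.
 *
 *     bdrv_drained_begin(bs);
 *     ... no external requests are accepted, in-flight ones have completed ...
 *     bdrv_drained_end(bs);
 *
 * bdrv_drain() below is exactly this pattern with an empty critical section.
 */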

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* Execute pending BHs first (may modify the graph) and check everything
     * else only after the BHs have executed. */
    while (aio_poll(qemu_get_aio_context(), false));

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
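
/*
 * Worked example (illustrative, assuming a 64 KiB cluster size): for
 * offset = 70000 and bytes = 4096, bdrv_round_to_clusters() yields
 * cluster_offset = 65536 (70000 rounded down to the cluster boundary) and
 * cluster_bytes = 65536 (70000 - 65536 + 4096 = 8560, rounded up to one
 * whole cluster), i.e. the region grows to cover every cluster it touches.
 */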

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick(bdrv_get_aio_wait(bs));
    aio_wait_kick(&drain_all_aio_wait);
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}
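
/*
 * Note: rwco.ret starts out as the NOT_DONE sentinel and is overwritten by
 * bdrv_rw_co_entry() once the request completes, so the BDRV_POLL_WHILE()
 * above simply runs the AioContext until the sentinel is gone. The
 * sector-based bdrv_rw_co() below is a thin wrapper around this helper.
 */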

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        offset += bytes;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}
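
/*
 * Illustrative sketch (not part of the original file): the byte-based
 * synchronous helpers above are convenient for small metadata accesses, e.g.
 *
 *     uint8_t header[512];
 *     int ret = bdrv_pread(child, 0, header, sizeof(header));
 *     if (ret < 0) {
 *         ... handle error ...
 *     }
 *
 * On success they return the number of bytes transferred (qiov->size).
 */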

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
    assert(drv->bdrv_co_readv);

    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating a cluster in the image file. Note that this value may
     * exceed BDRV_REQUEST_MAX_BYTES (even when the original read did not),
     * which is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    while (cluster_bytes) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if
             * unallocated; we'll probably fail again soon on the
             * read, but at least that will set a decent errno.
             */
            pnum = MIN(cluster_bytes, max_transfer);
        }

        assert(skip_bytes < pnum);

        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            iov.iov_base = bounce_buffer;
            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_external(&local_qiov, &iov, 1);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests. If this is a deliberate copy-on-read
                 * then we don't want to ignore the error. Simply
                 * report it in all cases.
                 */
                goto err;
            }

            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers. For now, there aren't any
     * passthrough flags. */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap. This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster. For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            QEMUIOVector local_qiov;

            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Handle a read request in coroutine context
 */
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request. Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes. */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector. */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwritev() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    if (flags & BDRV_REQ_WRITE_UNCHANGED) {
        assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
    } else {
        assert(child->perm & BLK_PERM_WRITE);
    }
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, offset, bytes);

    stat64_max(&bs->wr_highest_offset, offset + bytes);

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, end_sector);
        ret = 0;
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    BlockDriverState *bs = child->bs;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = bs->bl.request_alignment;
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);

    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base = buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
                                   align, &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}

/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base = head_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base = tail_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}
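
/*
 * Illustrative sketch (not part of the original file): a caller that wants a
 * region to read back as zeroes, and is happy for it to be unmapped, would
 * issue something like
 *
 *     ret = bdrv_co_pwrite_zeroes(child, offset, bytes, BDRV_REQ_MAY_UNMAP);
 *
 * BDRV_REQ_MAY_UNMAP is silently dropped above when the image was not opened
 * with BDRV_O_UNMAP.
 */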

/*
 * Flush ALL BDSes regardless of whether they are reachable via a BlkBackend
 * or not.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}


typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    bool want_zero;
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
    BlockDriverState **file;
    int ret;
    bool done;
} BdrvCoBlockStatusData;

int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
                                                bool want_zero,
                                                int64_t offset,
                                                int64_t bytes,
                                                int64_t *pnum,
                                                int64_t *map,
                                                BlockDriverState **file)
{
    assert(bs->file && bs->file->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->file->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}

int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    assert(bs->backing && bs->backing->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}

/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to. If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state. Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure. Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
1981 */ 1982 static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 1983 bool want_zero, 1984 int64_t offset, int64_t bytes, 1985 int64_t *pnum, int64_t *map, 1986 BlockDriverState **file) 1987 { 1988 int64_t total_size; 1989 int64_t n; /* bytes */ 1990 int ret; 1991 int64_t local_map = 0; 1992 BlockDriverState *local_file = NULL; 1993 int64_t aligned_offset, aligned_bytes; 1994 uint32_t align; 1995 1996 assert(pnum); 1997 *pnum = 0; 1998 total_size = bdrv_getlength(bs); 1999 if (total_size < 0) { 2000 ret = total_size; 2001 goto early_out; 2002 } 2003 2004 if (offset >= total_size) { 2005 ret = BDRV_BLOCK_EOF; 2006 goto early_out; 2007 } 2008 if (!bytes) { 2009 ret = 0; 2010 goto early_out; 2011 } 2012 2013 n = total_size - offset; 2014 if (n < bytes) { 2015 bytes = n; 2016 } 2017 2018 /* Must be non-NULL or bdrv_getlength() would have failed */ 2019 assert(bs->drv); 2020 if (!bs->drv->bdrv_co_block_status) { 2021 *pnum = bytes; 2022 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 2023 if (offset + bytes == total_size) { 2024 ret |= BDRV_BLOCK_EOF; 2025 } 2026 if (bs->drv->protocol_name) { 2027 ret |= BDRV_BLOCK_OFFSET_VALID; 2028 local_map = offset; 2029 local_file = bs; 2030 } 2031 goto early_out; 2032 } 2033 2034 bdrv_inc_in_flight(bs); 2035 2036 /* Round out to request_alignment boundaries */ 2037 align = bs->bl.request_alignment; 2038 aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2039 aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2040 2041 ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 2042 aligned_bytes, pnum, &local_map, 2043 &local_file); 2044 if (ret < 0) { 2045 *pnum = 0; 2046 goto out; 2047 } 2048 2049 /* 2050 * The driver's result must be a non-zero multiple of request_alignment. 2051 * Clamp pnum and adjust map to original request. 2052 */ 2053 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2054 align > offset - aligned_offset); 2055 *pnum -= offset - aligned_offset; 2056 if (*pnum > bytes) { 2057 *pnum = bytes; 2058 } 2059 if (ret & BDRV_BLOCK_OFFSET_VALID) { 2060 local_map += offset - aligned_offset; 2061 } 2062 2063 if (ret & BDRV_BLOCK_RAW) { 2064 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2065 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2066 *pnum, pnum, &local_map, &local_file); 2067 goto out; 2068 } 2069 2070 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2071 ret |= BDRV_BLOCK_ALLOCATED; 2072 } else if (want_zero) { 2073 if (bdrv_unallocated_blocks_are_zero(bs)) { 2074 ret |= BDRV_BLOCK_ZERO; 2075 } else if (bs->backing) { 2076 BlockDriverState *bs2 = bs->backing->bs; 2077 int64_t size2 = bdrv_getlength(bs2); 2078 2079 if (size2 >= 0 && offset >= size2) { 2080 ret |= BDRV_BLOCK_ZERO; 2081 } 2082 } 2083 } 2084 2085 if (want_zero && local_file && local_file != bs && 2086 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2087 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2088 int64_t file_pnum; 2089 int ret2; 2090 2091 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2092 *pnum, &file_pnum, NULL, NULL); 2093 if (ret2 >= 0) { 2094 /* Ignore errors. This is just providing extra information, it 2095 * is useful but not necessary. 2096 */ 2097 if (ret2 & BDRV_BLOCK_EOF && 2098 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2099 /* 2100 * It is valid for the format block driver to read 2101 * beyond the end of the underlying file's current 2102 * size; such areas read as zero. 
2103 */ 2104 ret |= BDRV_BLOCK_ZERO; 2105 } else { 2106 /* Limit request to the range reported by the protocol driver */ 2107 *pnum = file_pnum; 2108 ret |= (ret2 & BDRV_BLOCK_ZERO); 2109 } 2110 } 2111 } 2112 2113 out: 2114 bdrv_dec_in_flight(bs); 2115 if (ret >= 0 && offset + *pnum == total_size) { 2116 ret |= BDRV_BLOCK_EOF; 2117 } 2118 early_out: 2119 if (file) { 2120 *file = local_file; 2121 } 2122 if (map) { 2123 *map = local_map; 2124 } 2125 return ret; 2126 } 2127 2128 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2129 BlockDriverState *base, 2130 bool want_zero, 2131 int64_t offset, 2132 int64_t bytes, 2133 int64_t *pnum, 2134 int64_t *map, 2135 BlockDriverState **file) 2136 { 2137 BlockDriverState *p; 2138 int ret = 0; 2139 bool first = true; 2140 2141 assert(bs != base); 2142 for (p = bs; p != base; p = backing_bs(p)) { 2143 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2144 file); 2145 if (ret < 0) { 2146 break; 2147 } 2148 if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2149 /* 2150 * Reading beyond the end of the file continues to read 2151 * zeroes, but we can only widen the result to the 2152 * unallocated length we learned from an earlier 2153 * iteration. 2154 */ 2155 *pnum = bytes; 2156 } 2157 if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2158 break; 2159 } 2160 /* [offset, pnum] unallocated on this layer, which could be only 2161 * the first part of [offset, bytes]. */ 2162 bytes = MIN(bytes, *pnum); 2163 first = false; 2164 } 2165 return ret; 2166 } 2167 2168 /* Coroutine wrapper for bdrv_block_status_above() */ 2169 static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 2170 { 2171 BdrvCoBlockStatusData *data = opaque; 2172 2173 data->ret = bdrv_co_block_status_above(data->bs, data->base, 2174 data->want_zero, 2175 data->offset, data->bytes, 2176 data->pnum, data->map, data->file); 2177 data->done = true; 2178 } 2179 2180 /* 2181 * Synchronous wrapper around bdrv_co_block_status_above(). 2182 * 2183 * See bdrv_co_block_status_above() for details. 
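 *
 * If called from coroutine context, the status is computed directly;
 * otherwise a coroutine running bdrv_block_status_above_co_entry() is
 * created and entered, and the caller polls with BDRV_POLL_WHILE() until
 * the coroutine has set data.done.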
2184 */ 2185 static int bdrv_common_block_status_above(BlockDriverState *bs, 2186 BlockDriverState *base, 2187 bool want_zero, int64_t offset, 2188 int64_t bytes, int64_t *pnum, 2189 int64_t *map, 2190 BlockDriverState **file) 2191 { 2192 Coroutine *co; 2193 BdrvCoBlockStatusData data = { 2194 .bs = bs, 2195 .base = base, 2196 .want_zero = want_zero, 2197 .offset = offset, 2198 .bytes = bytes, 2199 .pnum = pnum, 2200 .map = map, 2201 .file = file, 2202 .done = false, 2203 }; 2204 2205 if (qemu_in_coroutine()) { 2206 /* Fast-path if already in coroutine context */ 2207 bdrv_block_status_above_co_entry(&data); 2208 } else { 2209 co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data); 2210 bdrv_coroutine_enter(bs, co); 2211 BDRV_POLL_WHILE(bs, !data.done); 2212 } 2213 return data.ret; 2214 } 2215 2216 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 2217 int64_t offset, int64_t bytes, int64_t *pnum, 2218 int64_t *map, BlockDriverState **file) 2219 { 2220 return bdrv_common_block_status_above(bs, base, true, offset, bytes, 2221 pnum, map, file); 2222 } 2223 2224 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2225 int64_t *pnum, int64_t *map, BlockDriverState **file) 2226 { 2227 return bdrv_block_status_above(bs, backing_bs(bs), 2228 offset, bytes, pnum, map, file); 2229 } 2230 2231 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2232 int64_t bytes, int64_t *pnum) 2233 { 2234 int ret; 2235 int64_t dummy; 2236 2237 ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, 2238 bytes, pnum ? pnum : &dummy, NULL, 2239 NULL); 2240 if (ret < 0) { 2241 return ret; 2242 } 2243 return !!(ret & BDRV_BLOCK_ALLOCATED); 2244 } 2245 2246 /* 2247 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 2248 * 2249 * Return true if (a prefix of) the given range is allocated in any image 2250 * between BASE and TOP (inclusive). BASE can be NULL to check if the given 2251 * offset is allocated in any image of the chain. Return false otherwise, 2252 * or negative errno on failure. 2253 * 2254 * 'pnum' is set to the number of bytes (including and immediately 2255 * following the specified offset) that are known to be in the same 2256 * allocated/unallocated state. Note that a subsequent call starting 2257 * at 'offset + *pnum' may return the same allocation status (in other 2258 * words, the result is not necessarily the maximum possible range); 2259 * but 'pnum' will only be 0 when end of file is reached. 
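 *
 * Example (illustrative sketch, not part of the original comment): deciding
 * whether a chunk needs to be copied when flattening part of a chain, with
 * 'top' and 'base' being whatever nodes the caller operates on:
 *
 *     int64_t pnum;
 *     int ret = bdrv_is_allocated_above(top, base, offset, bytes, &pnum);
 *     if (ret < 0) {
 *         return ret;
 *     } else if (ret) {
 *         ... the first pnum bytes at offset are allocated in one of the
 *             layers between 'top' and 'base' and may need copying ...
 *     } else {
 *         ... the first pnum bytes at offset are unallocated in all of
 *             those layers and can be skipped ...
 *     }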
2260 * 2261 */ 2262 int bdrv_is_allocated_above(BlockDriverState *top, 2263 BlockDriverState *base, 2264 int64_t offset, int64_t bytes, int64_t *pnum) 2265 { 2266 BlockDriverState *intermediate; 2267 int ret; 2268 int64_t n = bytes; 2269 2270 intermediate = top; 2271 while (intermediate && intermediate != base) { 2272 int64_t pnum_inter; 2273 int64_t size_inter; 2274 2275 ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 2276 if (ret < 0) { 2277 return ret; 2278 } 2279 if (ret) { 2280 *pnum = pnum_inter; 2281 return 1; 2282 } 2283 2284 size_inter = bdrv_getlength(intermediate); 2285 if (size_inter < 0) { 2286 return size_inter; 2287 } 2288 if (n > pnum_inter && 2289 (intermediate == top || offset + pnum_inter < size_inter)) { 2290 n = pnum_inter; 2291 } 2292 2293 intermediate = backing_bs(intermediate); 2294 } 2295 2296 *pnum = n; 2297 return 0; 2298 } 2299 2300 typedef struct BdrvVmstateCo { 2301 BlockDriverState *bs; 2302 QEMUIOVector *qiov; 2303 int64_t pos; 2304 bool is_read; 2305 int ret; 2306 } BdrvVmstateCo; 2307 2308 static int coroutine_fn 2309 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2310 bool is_read) 2311 { 2312 BlockDriver *drv = bs->drv; 2313 int ret = -ENOTSUP; 2314 2315 bdrv_inc_in_flight(bs); 2316 2317 if (!drv) { 2318 ret = -ENOMEDIUM; 2319 } else if (drv->bdrv_load_vmstate) { 2320 if (is_read) { 2321 ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2322 } else { 2323 ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2324 } 2325 } else if (bs->file) { 2326 ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); 2327 } 2328 2329 bdrv_dec_in_flight(bs); 2330 return ret; 2331 } 2332 2333 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) 2334 { 2335 BdrvVmstateCo *co = opaque; 2336 co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); 2337 } 2338 2339 static inline int 2340 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2341 bool is_read) 2342 { 2343 if (qemu_in_coroutine()) { 2344 return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); 2345 } else { 2346 BdrvVmstateCo data = { 2347 .bs = bs, 2348 .qiov = qiov, 2349 .pos = pos, 2350 .is_read = is_read, 2351 .ret = -EINPROGRESS, 2352 }; 2353 Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); 2354 2355 bdrv_coroutine_enter(bs, co); 2356 BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS); 2357 return data.ret; 2358 } 2359 } 2360 2361 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 2362 int64_t pos, int size) 2363 { 2364 QEMUIOVector qiov; 2365 struct iovec iov = { 2366 .iov_base = (void *) buf, 2367 .iov_len = size, 2368 }; 2369 int ret; 2370 2371 qemu_iovec_init_external(&qiov, &iov, 1); 2372 2373 ret = bdrv_writev_vmstate(bs, &qiov, pos); 2374 if (ret < 0) { 2375 return ret; 2376 } 2377 2378 return size; 2379 } 2380 2381 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2382 { 2383 return bdrv_rw_vmstate(bs, qiov, pos, false); 2384 } 2385 2386 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 2387 int64_t pos, int size) 2388 { 2389 QEMUIOVector qiov; 2390 struct iovec iov = { 2391 .iov_base = buf, 2392 .iov_len = size, 2393 }; 2394 int ret; 2395 2396 qemu_iovec_init_external(&qiov, &iov, 1); 2397 ret = bdrv_readv_vmstate(bs, &qiov, pos); 2398 if (ret < 0) { 2399 return ret; 2400 } 2401 2402 return size; 2403 } 2404 2405 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2406 { 2407 return bdrv_rw_vmstate(bs, qiov, pos, true); 2408 } 2409 
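
/*
 * Example (illustrative sketch, not part of the original file): persisting a
 * small opaque blob of VM state through the helpers above. 'bs' and 'my_pos'
 * are placeholders chosen by the caller; both helpers return the requested
 * 'size' on success or a negative errno on failure.
 *
 *     uint8_t blob[64];
 *     int ret;
 *
 *     ret = bdrv_save_vmstate(bs, blob, my_pos, sizeof(blob));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *
 *     ret = bdrv_load_vmstate(bs, blob, my_pos, sizeof(blob));
 *     if (ret < 0) {
 *         return ret;
 *     }
 */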
2410 /**************************************************************/ 2411 /* async I/Os */ 2412 2413 void bdrv_aio_cancel(BlockAIOCB *acb) 2414 { 2415 qemu_aio_ref(acb); 2416 bdrv_aio_cancel_async(acb); 2417 while (acb->refcnt > 1) { 2418 if (acb->aiocb_info->get_aio_context) { 2419 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2420 } else if (acb->bs) { 2421 /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 2422 * assert that we're not using an I/O thread. Thread-safe 2423 * code should use bdrv_aio_cancel_async exclusively. 2424 */ 2425 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 2426 aio_poll(bdrv_get_aio_context(acb->bs), true); 2427 } else { 2428 abort(); 2429 } 2430 } 2431 qemu_aio_unref(acb); 2432 } 2433 2434 /* Async version of aio cancel. The caller is not blocked if the acb implements 2435 * cancel_async, otherwise we do nothing and let the request normally complete. 2436 * In either case the completion callback must be called. */ 2437 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2438 { 2439 if (acb->aiocb_info->cancel_async) { 2440 acb->aiocb_info->cancel_async(acb); 2441 } 2442 } 2443 2444 /**************************************************************/ 2445 /* Coroutine block device emulation */ 2446 2447 typedef struct FlushCo { 2448 BlockDriverState *bs; 2449 int ret; 2450 } FlushCo; 2451 2452 2453 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 2454 { 2455 FlushCo *rwco = opaque; 2456 2457 rwco->ret = bdrv_co_flush(rwco->bs); 2458 } 2459 2460 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2461 { 2462 int current_gen; 2463 int ret = 0; 2464 2465 bdrv_inc_in_flight(bs); 2466 2467 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2468 bdrv_is_sg(bs)) { 2469 goto early_exit; 2470 } 2471 2472 qemu_co_mutex_lock(&bs->reqs_lock); 2473 current_gen = atomic_read(&bs->write_gen); 2474 2475 /* Wait until any previous flushes are completed */ 2476 while (bs->active_flush_req) { 2477 qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 2478 } 2479 2480 /* Flushes reach this point in nondecreasing current_gen order. 
*/ 2481 bs->active_flush_req = true; 2482 qemu_co_mutex_unlock(&bs->reqs_lock); 2483 2484 /* Write back all layers by calling one driver function */ 2485 if (bs->drv->bdrv_co_flush) { 2486 ret = bs->drv->bdrv_co_flush(bs); 2487 goto out; 2488 } 2489 2490 /* Write back cached data to the OS even with cache=unsafe */ 2491 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2492 if (bs->drv->bdrv_co_flush_to_os) { 2493 ret = bs->drv->bdrv_co_flush_to_os(bs); 2494 if (ret < 0) { 2495 goto out; 2496 } 2497 } 2498 2499 /* But don't actually force it to the disk with cache=unsafe */ 2500 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2501 goto flush_parent; 2502 } 2503 2504 /* Check if we really need to flush anything */ 2505 if (bs->flushed_gen == current_gen) { 2506 goto flush_parent; 2507 } 2508 2509 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2510 if (!bs->drv) { 2511 /* bs->drv->bdrv_co_flush() might have ejected the BDS 2512 * (even in case of apparent success) */ 2513 ret = -ENOMEDIUM; 2514 goto out; 2515 } 2516 if (bs->drv->bdrv_co_flush_to_disk) { 2517 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2518 } else if (bs->drv->bdrv_aio_flush) { 2519 BlockAIOCB *acb; 2520 CoroutineIOCompletion co = { 2521 .coroutine = qemu_coroutine_self(), 2522 }; 2523 2524 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2525 if (acb == NULL) { 2526 ret = -EIO; 2527 } else { 2528 qemu_coroutine_yield(); 2529 ret = co.ret; 2530 } 2531 } else { 2532 /* 2533 * Some block drivers always operate in either writethrough or unsafe 2534 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2535 * know how the server works (because the behaviour is hardcoded or 2536 * depends on server-side configuration), so we can't ensure that 2537 * everything is safe on disk. Returning an error doesn't work because 2538 * that would break guests even if the server operates in writethrough 2539 * mode. 2540 * 2541 * Let's hope the user knows what he's doing. 2542 */ 2543 ret = 0; 2544 } 2545 2546 if (ret < 0) { 2547 goto out; 2548 } 2549 2550 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2551 * in the case of cache=unsafe, so there are no useless flushes. 2552 */ 2553 flush_parent: 2554 ret = bs->file ? 
bdrv_co_flush(bs->file->bs) : 0; 2555 out: 2556 /* Notify any pending flushes that we have completed */ 2557 if (ret == 0) { 2558 bs->flushed_gen = current_gen; 2559 } 2560 2561 qemu_co_mutex_lock(&bs->reqs_lock); 2562 bs->active_flush_req = false; 2563 /* Return value is ignored - it's ok if wait queue is empty */ 2564 qemu_co_queue_next(&bs->flush_queue); 2565 qemu_co_mutex_unlock(&bs->reqs_lock); 2566 2567 early_exit: 2568 bdrv_dec_in_flight(bs); 2569 return ret; 2570 } 2571 2572 int bdrv_flush(BlockDriverState *bs) 2573 { 2574 Coroutine *co; 2575 FlushCo flush_co = { 2576 .bs = bs, 2577 .ret = NOT_DONE, 2578 }; 2579 2580 if (qemu_in_coroutine()) { 2581 /* Fast-path if already in coroutine context */ 2582 bdrv_flush_co_entry(&flush_co); 2583 } else { 2584 co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); 2585 bdrv_coroutine_enter(bs, co); 2586 BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE); 2587 } 2588 2589 return flush_co.ret; 2590 } 2591 2592 typedef struct DiscardCo { 2593 BlockDriverState *bs; 2594 int64_t offset; 2595 int bytes; 2596 int ret; 2597 } DiscardCo; 2598 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) 2599 { 2600 DiscardCo *rwco = opaque; 2601 2602 rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes); 2603 } 2604 2605 int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, 2606 int bytes) 2607 { 2608 BdrvTrackedRequest req; 2609 int max_pdiscard, ret; 2610 int head, tail, align; 2611 2612 if (!bs->drv) { 2613 return -ENOMEDIUM; 2614 } 2615 2616 if (bdrv_has_readonly_bitmaps(bs)) { 2617 return -EPERM; 2618 } 2619 2620 ret = bdrv_check_byte_request(bs, offset, bytes); 2621 if (ret < 0) { 2622 return ret; 2623 } else if (bs->read_only) { 2624 return -EPERM; 2625 } 2626 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 2627 2628 /* Do nothing if disabled. */ 2629 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2630 return 0; 2631 } 2632 2633 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 2634 return 0; 2635 } 2636 2637 /* Discard is advisory, but some devices track and coalesce 2638 * unaligned requests, so we must pass everything down rather than 2639 * round here. Still, most devices will just silently ignore 2640 * unaligned requests (by returning -ENOTSUP), so we must fragment 2641 * the request accordingly. */ 2642 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2643 assert(align % bs->bl.request_alignment == 0); 2644 head = offset % align; 2645 tail = (offset + bytes) % align; 2646 2647 bdrv_inc_in_flight(bs); 2648 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 2649 2650 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); 2651 if (ret < 0) { 2652 goto out; 2653 } 2654 2655 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 2656 align); 2657 assert(max_pdiscard >= bs->bl.request_alignment); 2658 2659 while (bytes > 0) { 2660 int num = bytes; 2661 2662 if (head) { 2663 /* Make small requests to get to alignment boundaries. */ 2664 num = MIN(bytes, align - head); 2665 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 2666 num %= bs->bl.request_alignment; 2667 } 2668 head = (head + num) % align; 2669 assert(num < max_pdiscard); 2670 } else if (tail) { 2671 if (num > align) { 2672 /* Shorten the request to the last aligned cluster. 
*/ 2673 num -= tail; 2674 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 2675 tail > bs->bl.request_alignment) { 2676 tail %= bs->bl.request_alignment; 2677 num -= tail; 2678 } 2679 } 2680 /* limit request size */ 2681 if (num > max_pdiscard) { 2682 num = max_pdiscard; 2683 } 2684 2685 if (!bs->drv) { 2686 ret = -ENOMEDIUM; 2687 goto out; 2688 } 2689 if (bs->drv->bdrv_co_pdiscard) { 2690 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 2691 } else { 2692 BlockAIOCB *acb; 2693 CoroutineIOCompletion co = { 2694 .coroutine = qemu_coroutine_self(), 2695 }; 2696 2697 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 2698 bdrv_co_io_em_complete, &co); 2699 if (acb == NULL) { 2700 ret = -EIO; 2701 goto out; 2702 } else { 2703 qemu_coroutine_yield(); 2704 ret = co.ret; 2705 } 2706 } 2707 if (ret && ret != -ENOTSUP) { 2708 goto out; 2709 } 2710 2711 offset += num; 2712 bytes -= num; 2713 } 2714 ret = 0; 2715 out: 2716 atomic_inc(&bs->write_gen); 2717 bdrv_set_dirty(bs, req.offset, req.bytes); 2718 tracked_request_end(&req); 2719 bdrv_dec_in_flight(bs); 2720 return ret; 2721 } 2722 2723 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) 2724 { 2725 Coroutine *co; 2726 DiscardCo rwco = { 2727 .bs = bs, 2728 .offset = offset, 2729 .bytes = bytes, 2730 .ret = NOT_DONE, 2731 }; 2732 2733 if (qemu_in_coroutine()) { 2734 /* Fast-path if already in coroutine context */ 2735 bdrv_pdiscard_co_entry(&rwco); 2736 } else { 2737 co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); 2738 bdrv_coroutine_enter(bs, co); 2739 BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE); 2740 } 2741 2742 return rwco.ret; 2743 } 2744 2745 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 2746 { 2747 BlockDriver *drv = bs->drv; 2748 CoroutineIOCompletion co = { 2749 .coroutine = qemu_coroutine_self(), 2750 }; 2751 BlockAIOCB *acb; 2752 2753 bdrv_inc_in_flight(bs); 2754 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 2755 co.ret = -ENOTSUP; 2756 goto out; 2757 } 2758 2759 if (drv->bdrv_co_ioctl) { 2760 co.ret = drv->bdrv_co_ioctl(bs, req, buf); 2761 } else { 2762 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 2763 if (!acb) { 2764 co.ret = -ENOTSUP; 2765 goto out; 2766 } 2767 qemu_coroutine_yield(); 2768 } 2769 out: 2770 bdrv_dec_in_flight(bs); 2771 return co.ret; 2772 } 2773 2774 void *qemu_blockalign(BlockDriverState *bs, size_t size) 2775 { 2776 return qemu_memalign(bdrv_opt_mem_align(bs), size); 2777 } 2778 2779 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 2780 { 2781 return memset(qemu_blockalign(bs, size), 0, size); 2782 } 2783 2784 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 2785 { 2786 size_t align = bdrv_opt_mem_align(bs); 2787 2788 /* Ensure that NULL is never returned on success */ 2789 assert(align > 0); 2790 if (size == 0) { 2791 size = align; 2792 } 2793 2794 return qemu_try_memalign(align, size); 2795 } 2796 2797 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 2798 { 2799 void *mem = qemu_try_blockalign(bs, size); 2800 2801 if (mem) { 2802 memset(mem, 0, size); 2803 } 2804 2805 return mem; 2806 } 2807 2808 /* 2809 * Check if all memory in this vector is sector aligned. 
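 *
 * Example (illustrative sketch, not part of the original comment): a caller
 * with strict alignment requirements, e.g. for O_DIRECT-style access, might
 * check the vector and fall back to a bounce buffer allocated with the
 * helpers above:
 *
 *     if (!bdrv_qiov_is_aligned(bs, qiov)) {
 *         void *bounce = qemu_try_blockalign(bs, qiov->size);
 *         if (bounce == NULL) {
 *             return -ENOMEM;
 *         }
 *         ... copy the vector into 'bounce', issue the aligned request,
 *             then release the buffer with qemu_vfree(bounce) ...
 *     }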
2810 */ 2811 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 2812 { 2813 int i; 2814 size_t alignment = bdrv_min_mem_align(bs); 2815 2816 for (i = 0; i < qiov->niov; i++) { 2817 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 2818 return false; 2819 } 2820 if (qiov->iov[i].iov_len % alignment) { 2821 return false; 2822 } 2823 } 2824 2825 return true; 2826 } 2827 2828 void bdrv_add_before_write_notifier(BlockDriverState *bs, 2829 NotifierWithReturn *notifier) 2830 { 2831 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 2832 } 2833 2834 void bdrv_io_plug(BlockDriverState *bs) 2835 { 2836 BdrvChild *child; 2837 2838 QLIST_FOREACH(child, &bs->children, next) { 2839 bdrv_io_plug(child->bs); 2840 } 2841 2842 if (atomic_fetch_inc(&bs->io_plugged) == 0) { 2843 BlockDriver *drv = bs->drv; 2844 if (drv && drv->bdrv_io_plug) { 2845 drv->bdrv_io_plug(bs); 2846 } 2847 } 2848 } 2849 2850 void bdrv_io_unplug(BlockDriverState *bs) 2851 { 2852 BdrvChild *child; 2853 2854 assert(bs->io_plugged); 2855 if (atomic_fetch_dec(&bs->io_plugged) == 1) { 2856 BlockDriver *drv = bs->drv; 2857 if (drv && drv->bdrv_io_unplug) { 2858 drv->bdrv_io_unplug(bs); 2859 } 2860 } 2861 2862 QLIST_FOREACH(child, &bs->children, next) { 2863 bdrv_io_unplug(child->bs); 2864 } 2865 } 2866 2867 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 2868 { 2869 BdrvChild *child; 2870 2871 if (bs->drv && bs->drv->bdrv_register_buf) { 2872 bs->drv->bdrv_register_buf(bs, host, size); 2873 } 2874 QLIST_FOREACH(child, &bs->children, next) { 2875 bdrv_register_buf(child->bs, host, size); 2876 } 2877 } 2878 2879 void bdrv_unregister_buf(BlockDriverState *bs, void *host) 2880 { 2881 BdrvChild *child; 2882 2883 if (bs->drv && bs->drv->bdrv_unregister_buf) { 2884 bs->drv->bdrv_unregister_buf(bs, host); 2885 } 2886 QLIST_FOREACH(child, &bs->children, next) { 2887 bdrv_unregister_buf(child->bs, host); 2888 } 2889 } 2890 2891 static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, 2892 uint64_t src_offset, 2893 BdrvChild *dst, 2894 uint64_t dst_offset, 2895 uint64_t bytes, 2896 BdrvRequestFlags flags, 2897 bool recurse_src) 2898 { 2899 BdrvTrackedRequest src_req, dst_req; 2900 BlockDriverState *src_bs = src->bs; 2901 BlockDriverState *dst_bs = dst->bs; 2902 int ret; 2903 2904 if (!src || !dst || !src->bs || !dst->bs) { 2905 return -ENOMEDIUM; 2906 } 2907 ret = bdrv_check_byte_request(src->bs, src_offset, bytes); 2908 if (ret) { 2909 return ret; 2910 } 2911 2912 ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); 2913 if (ret) { 2914 return ret; 2915 } 2916 if (flags & BDRV_REQ_ZERO_WRITE) { 2917 return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags); 2918 } 2919 2920 if (!src->bs->drv->bdrv_co_copy_range_from 2921 || !dst->bs->drv->bdrv_co_copy_range_to 2922 || src->bs->encrypted || dst->bs->encrypted) { 2923 return -ENOTSUP; 2924 } 2925 bdrv_inc_in_flight(src_bs); 2926 bdrv_inc_in_flight(dst_bs); 2927 tracked_request_begin(&src_req, src_bs, src_offset, 2928 bytes, BDRV_TRACKED_READ); 2929 tracked_request_begin(&dst_req, dst_bs, dst_offset, 2930 bytes, BDRV_TRACKED_WRITE); 2931 2932 wait_serialising_requests(&src_req); 2933 wait_serialising_requests(&dst_req); 2934 if (recurse_src) { 2935 ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 2936 src, src_offset, 2937 dst, dst_offset, 2938 bytes, flags); 2939 } else { 2940 ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 2941 src, src_offset, 2942 dst, dst_offset, 2943 bytes, flags); 2944 } 2945 
tracked_request_end(&src_req); 2946 tracked_request_end(&dst_req); 2947 bdrv_dec_in_flight(src_bs); 2948 bdrv_dec_in_flight(dst_bs); 2949 return ret; 2950 } 2951 2952 /* Copy range from @src to @dst. 2953 * 2954 * See the comment of bdrv_co_copy_range for the parameter and return value 2955 * semantics. */ 2956 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, 2957 BdrvChild *dst, uint64_t dst_offset, 2958 uint64_t bytes, BdrvRequestFlags flags) 2959 { 2960 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 2961 bytes, flags, true); 2962 } 2963 2964 /* Copy range from @src to @dst. 2965 * 2966 * See the comment of bdrv_co_copy_range for the parameter and return value 2967 * semantics. */ 2968 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, 2969 BdrvChild *dst, uint64_t dst_offset, 2970 uint64_t bytes, BdrvRequestFlags flags) 2971 { 2972 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 2973 bytes, flags, false); 2974 } 2975 2976 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, 2977 BdrvChild *dst, uint64_t dst_offset, 2978 uint64_t bytes, BdrvRequestFlags flags) 2979 { 2980 return bdrv_co_copy_range_from(src, src_offset, 2981 dst, dst_offset, 2982 bytes, flags); 2983 } 2984 2985 static void bdrv_parent_cb_resize(BlockDriverState *bs) 2986 { 2987 BdrvChild *c; 2988 QLIST_FOREACH(c, &bs->parents, next_parent) { 2989 if (c->role->resize) { 2990 c->role->resize(c); 2991 } 2992 } 2993 } 2994 2995 /** 2996 * Truncate file to 'offset' bytes (needed only for file protocols) 2997 */ 2998 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, 2999 PreallocMode prealloc, Error **errp) 3000 { 3001 BlockDriverState *bs = child->bs; 3002 BlockDriver *drv = bs->drv; 3003 BdrvTrackedRequest req; 3004 int64_t old_size, new_bytes; 3005 int ret; 3006 3007 assert(child->perm & BLK_PERM_RESIZE); 3008 3009 /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 3010 if (!drv) { 3011 error_setg(errp, "No medium inserted"); 3012 return -ENOMEDIUM; 3013 } 3014 if (offset < 0) { 3015 error_setg(errp, "Image size cannot be negative"); 3016 return -EINVAL; 3017 } 3018 3019 old_size = bdrv_getlength(bs); 3020 if (old_size < 0) { 3021 error_setg_errno(errp, -old_size, "Failed to get old image size"); 3022 return old_size; 3023 } 3024 3025 if (offset > old_size) { 3026 new_bytes = offset - old_size; 3027 } else { 3028 new_bytes = 0; 3029 } 3030 3031 bdrv_inc_in_flight(bs); 3032 tracked_request_begin(&req, bs, offset, new_bytes, BDRV_TRACKED_TRUNCATE); 3033 3034 /* If we are growing the image and potentially using preallocation for the 3035 * new area, we need to make sure that no write requests are made to it 3036 * concurrently or they might be overwritten by preallocation. 
*/ 3037 if (new_bytes) { 3038 mark_request_serialising(&req, 1); 3039 wait_serialising_requests(&req); 3040 } 3041 3042 if (!drv->bdrv_co_truncate) { 3043 if (bs->file && drv->is_filter) { 3044 ret = bdrv_co_truncate(bs->file, offset, prealloc, errp); 3045 goto out; 3046 } 3047 error_setg(errp, "Image format driver does not support resize"); 3048 ret = -ENOTSUP; 3049 goto out; 3050 } 3051 if (bs->read_only) { 3052 error_setg(errp, "Image is read-only"); 3053 ret = -EACCES; 3054 goto out; 3055 } 3056 3057 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 3058 3059 ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp); 3060 if (ret < 0) { 3061 goto out; 3062 } 3063 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 3064 if (ret < 0) { 3065 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 3066 } else { 3067 offset = bs->total_sectors * BDRV_SECTOR_SIZE; 3068 } 3069 bdrv_dirty_bitmap_truncate(bs, offset); 3070 bdrv_parent_cb_resize(bs); 3071 atomic_inc(&bs->write_gen); 3072 3073 out: 3074 tracked_request_end(&req); 3075 bdrv_dec_in_flight(bs); 3076 3077 return ret; 3078 } 3079 3080 typedef struct TruncateCo { 3081 BdrvChild *child; 3082 int64_t offset; 3083 PreallocMode prealloc; 3084 Error **errp; 3085 int ret; 3086 } TruncateCo; 3087 3088 static void coroutine_fn bdrv_truncate_co_entry(void *opaque) 3089 { 3090 TruncateCo *tco = opaque; 3091 tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc, 3092 tco->errp); 3093 } 3094 3095 int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc, 3096 Error **errp) 3097 { 3098 Coroutine *co; 3099 TruncateCo tco = { 3100 .child = child, 3101 .offset = offset, 3102 .prealloc = prealloc, 3103 .errp = errp, 3104 .ret = NOT_DONE, 3105 }; 3106 3107 if (qemu_in_coroutine()) { 3108 /* Fast-path if already in coroutine context */ 3109 bdrv_truncate_co_entry(&tco); 3110 } else { 3111 co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco); 3112 qemu_coroutine_enter(co); 3113 BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE); 3114 } 3115 3116 return tco.ret; 3117 } 3118
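
/*
 * Example (illustrative sketch, not part of the original file): growing an
 * image through the synchronous wrapper above. 'child' is a placeholder for
 * a BdrvChild whose parent holds BLK_PERM_RESIZE, and 'new_size' is the
 * desired length in bytes.
 *
 *     Error *local_err = NULL;
 *     int ret;
 *
 *     ret = bdrv_truncate(child, new_size, PREALLOC_MODE_OFF, &local_err);
 *     if (ret < 0) {
 *         error_propagate(errp, local_err);
 *         return ret;
 *     }
 */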