1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "trace.h" 27 #include "sysemu/block-backend.h" 28 #include "block/aio-wait.h" 29 #include "block/blockjob.h" 30 #include "block/blockjob_int.h" 31 #include "block/block_int.h" 32 #include "qemu/cutils.h" 33 #include "qapi/error.h" 34 #include "qemu/error-report.h" 35 36 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 37 38 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ 39 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) 40 41 static AioWait drain_all_aio_wait; 42 43 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 44 int64_t offset, int bytes, BdrvRequestFlags flags); 45 46 void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore, 47 bool ignore_bds_parents) 48 { 49 BdrvChild *c, *next; 50 51 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { 52 if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { 53 continue; 54 } 55 if (c->role->drained_begin) { 56 c->role->drained_begin(c); 57 } 58 } 59 } 60 61 void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore, 62 bool ignore_bds_parents) 63 { 64 BdrvChild *c, *next; 65 66 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { 67 if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { 68 continue; 69 } 70 if (c->role->drained_end) { 71 c->role->drained_end(c); 72 } 73 } 74 } 75 76 static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, 77 bool ignore_bds_parents) 78 { 79 BdrvChild *c, *next; 80 bool busy = false; 81 82 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { 83 if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { 84 continue; 85 } 86 if (c->role->drained_poll) { 87 busy |= c->role->drained_poll(c); 88 } 89 } 90 91 return busy; 92 } 93 94 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) 95 { 96 dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); 97 dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer); 98 dst->opt_mem_alignment = MAX(dst->opt_mem_alignment, 99 src->opt_mem_alignment); 100 dst->min_mem_alignment = MAX(dst->min_mem_alignment, 101 src->min_mem_alignment); 102 dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov); 103 } 104 105 void bdrv_refresh_limits(BlockDriverState *bs, 
Error **errp) 106 { 107 BlockDriver *drv = bs->drv; 108 Error *local_err = NULL; 109 110 memset(&bs->bl, 0, sizeof(bs->bl)); 111 112 if (!drv) { 113 return; 114 } 115 116 /* Default alignment based on whether driver has byte interface */ 117 bs->bl.request_alignment = (drv->bdrv_co_preadv || 118 drv->bdrv_aio_preadv) ? 1 : 512; 119 120 /* Take some limits from the children as a default */ 121 if (bs->file) { 122 bdrv_refresh_limits(bs->file->bs, &local_err); 123 if (local_err) { 124 error_propagate(errp, local_err); 125 return; 126 } 127 bdrv_merge_limits(&bs->bl, &bs->file->bs->bl); 128 } else { 129 bs->bl.min_mem_alignment = 512; 130 bs->bl.opt_mem_alignment = getpagesize(); 131 132 /* Safe default since most protocols use readv()/writev()/etc */ 133 bs->bl.max_iov = IOV_MAX; 134 } 135 136 if (bs->backing) { 137 bdrv_refresh_limits(bs->backing->bs, &local_err); 138 if (local_err) { 139 error_propagate(errp, local_err); 140 return; 141 } 142 bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl); 143 } 144 145 /* Then let the driver override it */ 146 if (drv->bdrv_refresh_limits) { 147 drv->bdrv_refresh_limits(bs, errp); 148 } 149 } 150 151 /** 152 * The copy-on-read flag is actually a reference count so multiple users may 153 * use the feature without worrying about clobbering its previous state. 154 * Copy-on-read stays enabled until all users have called to disable it. 155 */ 156 void bdrv_enable_copy_on_read(BlockDriverState *bs) 157 { 158 atomic_inc(&bs->copy_on_read); 159 } 160 161 void bdrv_disable_copy_on_read(BlockDriverState *bs) 162 { 163 int old = atomic_fetch_dec(&bs->copy_on_read); 164 assert(old >= 1); 165 } 166 167 typedef struct { 168 Coroutine *co; 169 BlockDriverState *bs; 170 bool done; 171 bool begin; 172 bool recursive; 173 bool poll; 174 BdrvChild *parent; 175 bool ignore_bds_parents; 176 } BdrvCoDrainData; 177 178 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) 179 { 180 BdrvCoDrainData *data = opaque; 181 BlockDriverState *bs = data->bs; 182 183 if (data->begin) { 184 bs->drv->bdrv_co_drain_begin(bs); 185 } else { 186 bs->drv->bdrv_co_drain_end(bs); 187 } 188 189 /* Set data->done before reading bs->wakeup. */ 190 atomic_mb_set(&data->done, true); 191 bdrv_dec_in_flight(bs); 192 193 if (data->begin) { 194 g_free(data); 195 } 196 } 197 198 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */ 199 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin) 200 { 201 BdrvCoDrainData *data; 202 203 if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) || 204 (!begin && !bs->drv->bdrv_co_drain_end)) { 205 return; 206 } 207 208 data = g_new(BdrvCoDrainData, 1); 209 *data = (BdrvCoDrainData) { 210 .bs = bs, 211 .done = false, 212 .begin = begin 213 }; 214 215 /* Make sure the driver callback completes during the polling phase for 216 * drain_begin. 
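     * (Sketch of the mechanism, as coded below: bdrv_inc_in_flight() keeps
     * the node's in_flight count non-zero, so the caller's BDRV_POLL_WHILE()
     * keeps polling until bdrv_drain_invoke_entry() has run and called
     * bdrv_dec_in_flight(); for drain_end we instead poll data->done right
     * here before freeing the data.)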
*/ 217 bdrv_inc_in_flight(bs); 218 data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data); 219 aio_co_schedule(bdrv_get_aio_context(bs), data->co); 220 221 if (!begin) { 222 BDRV_POLL_WHILE(bs, !data->done); 223 g_free(data); 224 } 225 } 226 227 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */ 228 bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, 229 BdrvChild *ignore_parent, bool ignore_bds_parents) 230 { 231 BdrvChild *child, *next; 232 233 if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) { 234 return true; 235 } 236 237 if (atomic_read(&bs->in_flight)) { 238 return true; 239 } 240 241 if (recursive) { 242 assert(!ignore_bds_parents); 243 QLIST_FOREACH_SAFE(child, &bs->children, next, next) { 244 if (bdrv_drain_poll(child->bs, recursive, child, false)) { 245 return true; 246 } 247 } 248 } 249 250 return false; 251 } 252 253 static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive, 254 BdrvChild *ignore_parent) 255 { 256 /* Execute pending BHs first and check everything else only after the BHs 257 * have executed. */ 258 while (aio_poll(bs->aio_context, false)); 259 260 return bdrv_drain_poll(bs, recursive, ignore_parent, false); 261 } 262 263 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, 264 BdrvChild *parent, bool ignore_bds_parents, 265 bool poll); 266 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, 267 BdrvChild *parent, bool ignore_bds_parents); 268 269 static void bdrv_co_drain_bh_cb(void *opaque) 270 { 271 BdrvCoDrainData *data = opaque; 272 Coroutine *co = data->co; 273 BlockDriverState *bs = data->bs; 274 275 if (bs) { 276 bdrv_dec_in_flight(bs); 277 if (data->begin) { 278 bdrv_do_drained_begin(bs, data->recursive, data->parent, 279 data->ignore_bds_parents, data->poll); 280 } else { 281 bdrv_do_drained_end(bs, data->recursive, data->parent, 282 data->ignore_bds_parents); 283 } 284 } else { 285 assert(data->begin); 286 bdrv_drain_all_begin(); 287 } 288 289 data->done = true; 290 aio_co_wake(co); 291 } 292 293 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, 294 bool begin, bool recursive, 295 BdrvChild *parent, 296 bool ignore_bds_parents, 297 bool poll) 298 { 299 BdrvCoDrainData data; 300 301 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and 302 * other coroutines run if they were queued by aio_co_enter(). */ 303 304 assert(qemu_in_coroutine()); 305 data = (BdrvCoDrainData) { 306 .co = qemu_coroutine_self(), 307 .bs = bs, 308 .done = false, 309 .begin = begin, 310 .recursive = recursive, 311 .parent = parent, 312 .ignore_bds_parents = ignore_bds_parents, 313 .poll = poll, 314 }; 315 if (bs) { 316 bdrv_inc_in_flight(bs); 317 } 318 aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), 319 bdrv_co_drain_bh_cb, &data); 320 321 qemu_coroutine_yield(); 322 /* If we are resumed from some other event (such as an aio completion or a 323 * timer callback), it is a bug in the caller that should be fixed. 
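     * (For reference, the flow coded above: bdrv_co_drain_bh_cb() performs
     * the actual drained_begin/end from BH context and only then sets
     * data.done and wakes this coroutine with aio_co_wake(), so data.done
     * must already be true once we resume here.)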
*/ 324 assert(data.done); 325 } 326 327 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, 328 BdrvChild *parent, bool ignore_bds_parents) 329 { 330 assert(!qemu_in_coroutine()); 331 332 /* Stop things in parent-to-child order */ 333 if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { 334 aio_disable_external(bdrv_get_aio_context(bs)); 335 } 336 337 bdrv_parent_drained_begin(bs, parent, ignore_bds_parents); 338 bdrv_drain_invoke(bs, true); 339 } 340 341 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, 342 BdrvChild *parent, bool ignore_bds_parents, 343 bool poll) 344 { 345 BdrvChild *child, *next; 346 347 if (qemu_in_coroutine()) { 348 bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents, 349 poll); 350 return; 351 } 352 353 bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents); 354 355 if (recursive) { 356 assert(!ignore_bds_parents); 357 bs->recursive_quiesce_counter++; 358 QLIST_FOREACH_SAFE(child, &bs->children, next, next) { 359 bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents, 360 false); 361 } 362 } 363 364 /* 365 * Wait for drained requests to finish. 366 * 367 * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The 368 * call is needed so things in this AioContext can make progress even 369 * though we don't return to the main AioContext loop - this automatically 370 * includes other nodes in the same AioContext and therefore all child 371 * nodes. 372 */ 373 if (poll) { 374 assert(!ignore_bds_parents); 375 BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent)); 376 } 377 } 378 379 void bdrv_drained_begin(BlockDriverState *bs) 380 { 381 bdrv_do_drained_begin(bs, false, NULL, false, true); 382 } 383 384 void bdrv_subtree_drained_begin(BlockDriverState *bs) 385 { 386 bdrv_do_drained_begin(bs, true, NULL, false, true); 387 } 388 389 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, 390 BdrvChild *parent, bool ignore_bds_parents) 391 { 392 BdrvChild *child, *next; 393 int old_quiesce_counter; 394 395 if (qemu_in_coroutine()) { 396 bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents, 397 false); 398 return; 399 } 400 assert(bs->quiesce_counter > 0); 401 old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter); 402 403 /* Re-enable things in child-to-parent order */ 404 bdrv_drain_invoke(bs, false); 405 bdrv_parent_drained_end(bs, parent, ignore_bds_parents); 406 if (old_quiesce_counter == 1) { 407 aio_enable_external(bdrv_get_aio_context(bs)); 408 } 409 410 if (recursive) { 411 assert(!ignore_bds_parents); 412 bs->recursive_quiesce_counter--; 413 QLIST_FOREACH_SAFE(child, &bs->children, next, next) { 414 bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents); 415 } 416 } 417 } 418 419 void bdrv_drained_end(BlockDriverState *bs) 420 { 421 bdrv_do_drained_end(bs, false, NULL, false); 422 } 423 424 void bdrv_subtree_drained_end(BlockDriverState *bs) 425 { 426 bdrv_do_drained_end(bs, true, NULL, false); 427 } 428 429 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent) 430 { 431 int i; 432 433 for (i = 0; i < new_parent->recursive_quiesce_counter; i++) { 434 bdrv_do_drained_begin(child->bs, true, child, false, true); 435 } 436 } 437 438 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent) 439 { 440 int i; 441 442 for (i = 0; i < old_parent->recursive_quiesce_counter; i++) { 443 bdrv_do_drained_end(child->bs, true, child, false); 444 } 445 } 446 447 /* 448 * Wait for pending requests to 
complete on a single BlockDriverState subtree, 449 * and suspend block driver's internal I/O until next request arrives. 450 * 451 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 452 * AioContext. 453 */ 454 void coroutine_fn bdrv_co_drain(BlockDriverState *bs) 455 { 456 assert(qemu_in_coroutine()); 457 bdrv_drained_begin(bs); 458 bdrv_drained_end(bs); 459 } 460 461 void bdrv_drain(BlockDriverState *bs) 462 { 463 bdrv_drained_begin(bs); 464 bdrv_drained_end(bs); 465 } 466 467 static void bdrv_drain_assert_idle(BlockDriverState *bs) 468 { 469 BdrvChild *child, *next; 470 471 assert(atomic_read(&bs->in_flight) == 0); 472 QLIST_FOREACH_SAFE(child, &bs->children, next, next) { 473 bdrv_drain_assert_idle(child->bs); 474 } 475 } 476 477 unsigned int bdrv_drain_all_count = 0; 478 479 static bool bdrv_drain_all_poll(void) 480 { 481 BlockDriverState *bs = NULL; 482 bool result = false; 483 484 /* Execute pending BHs first (may modify the graph) and check everything 485 * else only after the BHs have executed. */ 486 while (aio_poll(qemu_get_aio_context(), false)); 487 488 /* bdrv_drain_poll() can't make changes to the graph and we are holding the 489 * main AioContext lock, so iterating bdrv_next_all_states() is safe. */ 490 while ((bs = bdrv_next_all_states(bs))) { 491 AioContext *aio_context = bdrv_get_aio_context(bs); 492 aio_context_acquire(aio_context); 493 result |= bdrv_drain_poll(bs, false, NULL, true); 494 aio_context_release(aio_context); 495 } 496 497 return result; 498 } 499 500 /* 501 * Wait for pending requests to complete across all BlockDriverStates 502 * 503 * This function does not flush data to disk, use bdrv_flush_all() for that 504 * after calling this function. 505 * 506 * This pauses all block jobs and disables external clients. It must 507 * be paired with bdrv_drain_all_end(). 508 * 509 * NOTE: no new block jobs or BlockDriverStates can be created between 510 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls. 511 */ 512 void bdrv_drain_all_begin(void) 513 { 514 BlockDriverState *bs = NULL; 515 516 if (qemu_in_coroutine()) { 517 bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true); 518 return; 519 } 520 521 /* AIO_WAIT_WHILE() with a NULL context can only be called from the main 522 * loop AioContext, so make sure we're in the main context. */ 523 assert(qemu_get_current_aio_context() == qemu_get_aio_context()); 524 assert(bdrv_drain_all_count < INT_MAX); 525 bdrv_drain_all_count++; 526 527 /* Quiesce all nodes, without polling in-flight requests yet. The graph 528 * cannot change during this loop. 
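     * (The actual polling happens in the second phase below: a single
     * AIO_WAIT_WHILE() on drain_all_aio_wait re-evaluates
     * bdrv_drain_all_poll() until no node reports in-flight activity, and
     * bdrv_drain_assert_idle() then double-checks the result.)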
*/ 529 while ((bs = bdrv_next_all_states(bs))) { 530 AioContext *aio_context = bdrv_get_aio_context(bs); 531 532 aio_context_acquire(aio_context); 533 bdrv_do_drained_begin(bs, false, NULL, true, false); 534 aio_context_release(aio_context); 535 } 536 537 /* Now poll the in-flight requests */ 538 AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll()); 539 540 while ((bs = bdrv_next_all_states(bs))) { 541 bdrv_drain_assert_idle(bs); 542 } 543 } 544 545 void bdrv_drain_all_end(void) 546 { 547 BlockDriverState *bs = NULL; 548 549 while ((bs = bdrv_next_all_states(bs))) { 550 AioContext *aio_context = bdrv_get_aio_context(bs); 551 552 aio_context_acquire(aio_context); 553 bdrv_do_drained_end(bs, false, NULL, true); 554 aio_context_release(aio_context); 555 } 556 557 assert(bdrv_drain_all_count > 0); 558 bdrv_drain_all_count--; 559 } 560 561 void bdrv_drain_all(void) 562 { 563 bdrv_drain_all_begin(); 564 bdrv_drain_all_end(); 565 } 566 567 /** 568 * Remove an active request from the tracked requests list 569 * 570 * This function should be called when a tracked request is completing. 571 */ 572 static void tracked_request_end(BdrvTrackedRequest *req) 573 { 574 if (req->serialising) { 575 atomic_dec(&req->bs->serialising_in_flight); 576 } 577 578 qemu_co_mutex_lock(&req->bs->reqs_lock); 579 QLIST_REMOVE(req, list); 580 qemu_co_queue_restart_all(&req->wait_queue); 581 qemu_co_mutex_unlock(&req->bs->reqs_lock); 582 } 583 584 /** 585 * Add an active request to the tracked requests list 586 */ 587 static void tracked_request_begin(BdrvTrackedRequest *req, 588 BlockDriverState *bs, 589 int64_t offset, 590 unsigned int bytes, 591 enum BdrvTrackedRequestType type) 592 { 593 *req = (BdrvTrackedRequest){ 594 .bs = bs, 595 .offset = offset, 596 .bytes = bytes, 597 .type = type, 598 .co = qemu_coroutine_self(), 599 .serialising = false, 600 .overlap_offset = offset, 601 .overlap_bytes = bytes, 602 }; 603 604 qemu_co_queue_init(&req->wait_queue); 605 606 qemu_co_mutex_lock(&bs->reqs_lock); 607 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 608 qemu_co_mutex_unlock(&bs->reqs_lock); 609 } 610 611 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 612 { 613 int64_t overlap_offset = req->offset & ~(align - 1); 614 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 615 - overlap_offset; 616 617 if (!req->serialising) { 618 atomic_inc(&req->bs->serialising_in_flight); 619 req->serialising = true; 620 } 621 622 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 623 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 624 } 625 626 /** 627 * Round a region to cluster boundaries 628 */ 629 void bdrv_round_to_clusters(BlockDriverState *bs, 630 int64_t offset, int64_t bytes, 631 int64_t *cluster_offset, 632 int64_t *cluster_bytes) 633 { 634 BlockDriverInfo bdi; 635 636 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 637 *cluster_offset = offset; 638 *cluster_bytes = bytes; 639 } else { 640 int64_t c = bdi.cluster_size; 641 *cluster_offset = QEMU_ALIGN_DOWN(offset, c); 642 *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); 643 } 644 } 645 646 static int bdrv_get_cluster_size(BlockDriverState *bs) 647 { 648 BlockDriverInfo bdi; 649 int ret; 650 651 ret = bdrv_get_info(bs, &bdi); 652 if (ret < 0 || bdi.cluster_size == 0) { 653 return bs->bl.request_alignment; 654 } else { 655 return bdi.cluster_size; 656 } 657 } 658 659 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 660 int64_t offset, unsigned 
int bytes) 661 { 662 /* aaaa bbbb */ 663 if (offset >= req->overlap_offset + req->overlap_bytes) { 664 return false; 665 } 666 /* bbbb aaaa */ 667 if (req->overlap_offset >= offset + bytes) { 668 return false; 669 } 670 return true; 671 } 672 673 void bdrv_inc_in_flight(BlockDriverState *bs) 674 { 675 atomic_inc(&bs->in_flight); 676 } 677 678 void bdrv_wakeup(BlockDriverState *bs) 679 { 680 aio_wait_kick(bdrv_get_aio_wait(bs)); 681 aio_wait_kick(&drain_all_aio_wait); 682 } 683 684 void bdrv_dec_in_flight(BlockDriverState *bs) 685 { 686 atomic_dec(&bs->in_flight); 687 bdrv_wakeup(bs); 688 } 689 690 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 691 { 692 BlockDriverState *bs = self->bs; 693 BdrvTrackedRequest *req; 694 bool retry; 695 bool waited = false; 696 697 if (!atomic_read(&bs->serialising_in_flight)) { 698 return false; 699 } 700 701 do { 702 retry = false; 703 qemu_co_mutex_lock(&bs->reqs_lock); 704 QLIST_FOREACH(req, &bs->tracked_requests, list) { 705 if (req == self || (!req->serialising && !self->serialising)) { 706 continue; 707 } 708 if (tracked_request_overlaps(req, self->overlap_offset, 709 self->overlap_bytes)) 710 { 711 /* Hitting this means there was a reentrant request, for 712 * example, a block driver issuing nested requests. This must 713 * never happen since it means deadlock. 714 */ 715 assert(qemu_coroutine_self() != req->co); 716 717 /* If the request is already (indirectly) waiting for us, or 718 * will wait for us as soon as it wakes up, then just go on 719 * (instead of producing a deadlock in the former case). */ 720 if (!req->waiting_for) { 721 self->waiting_for = req; 722 qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock); 723 self->waiting_for = NULL; 724 retry = true; 725 waited = true; 726 break; 727 } 728 } 729 } 730 qemu_co_mutex_unlock(&bs->reqs_lock); 731 } while (retry); 732 733 return waited; 734 } 735 736 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 737 size_t size) 738 { 739 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 740 return -EIO; 741 } 742 743 if (!bdrv_is_inserted(bs)) { 744 return -ENOMEDIUM; 745 } 746 747 if (offset < 0) { 748 return -EIO; 749 } 750 751 return 0; 752 } 753 754 typedef struct RwCo { 755 BdrvChild *child; 756 int64_t offset; 757 QEMUIOVector *qiov; 758 bool is_write; 759 int ret; 760 BdrvRequestFlags flags; 761 } RwCo; 762 763 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 764 { 765 RwCo *rwco = opaque; 766 767 if (!rwco->is_write) { 768 rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset, 769 rwco->qiov->size, rwco->qiov, 770 rwco->flags); 771 } else { 772 rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset, 773 rwco->qiov->size, rwco->qiov, 774 rwco->flags); 775 } 776 } 777 778 /* 779 * Process a vectored synchronous request using coroutines 780 */ 781 static int bdrv_prwv_co(BdrvChild *child, int64_t offset, 782 QEMUIOVector *qiov, bool is_write, 783 BdrvRequestFlags flags) 784 { 785 Coroutine *co; 786 RwCo rwco = { 787 .child = child, 788 .offset = offset, 789 .qiov = qiov, 790 .is_write = is_write, 791 .ret = NOT_DONE, 792 .flags = flags, 793 }; 794 795 if (qemu_in_coroutine()) { 796 /* Fast-path if already in coroutine context */ 797 bdrv_rw_co_entry(&rwco); 798 } else { 799 co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); 800 bdrv_coroutine_enter(child->bs, co); 801 BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); 802 } 803 return rwco.ret; 804 } 805 806 /* 807 * Process a synchronous request using coroutines 808 */ 809 static 
int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf, 810 int nb_sectors, bool is_write, BdrvRequestFlags flags) 811 { 812 QEMUIOVector qiov; 813 struct iovec iov = { 814 .iov_base = (void *)buf, 815 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 816 }; 817 818 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 819 return -EINVAL; 820 } 821 822 qemu_iovec_init_external(&qiov, &iov, 1); 823 return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS, 824 &qiov, is_write, flags); 825 } 826 827 /* return < 0 if error. See bdrv_write() for the return codes */ 828 int bdrv_read(BdrvChild *child, int64_t sector_num, 829 uint8_t *buf, int nb_sectors) 830 { 831 return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0); 832 } 833 834 /* Return < 0 if error. Important errors are: 835 -EIO generic I/O error (may happen for all errors) 836 -ENOMEDIUM No media inserted. 837 -EINVAL Invalid sector number or nb_sectors 838 -EACCES Trying to write a read-only device 839 */ 840 int bdrv_write(BdrvChild *child, int64_t sector_num, 841 const uint8_t *buf, int nb_sectors) 842 { 843 return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 844 } 845 846 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, 847 int bytes, BdrvRequestFlags flags) 848 { 849 QEMUIOVector qiov; 850 struct iovec iov = { 851 .iov_base = NULL, 852 .iov_len = bytes, 853 }; 854 855 qemu_iovec_init_external(&qiov, &iov, 1); 856 return bdrv_prwv_co(child, offset, &qiov, true, 857 BDRV_REQ_ZERO_WRITE | flags); 858 } 859 860 /* 861 * Completely zero out a block device with the help of bdrv_pwrite_zeroes. 862 * The operation is sped up by checking the block status and only writing 863 * zeroes to the device if they currently do not return zeroes. Optional 864 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP, 865 * BDRV_REQ_FUA). 866 * 867 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
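 *
 * Illustrative use (a hypothetical caller, not taken from this file; any
 * BdrvChild such as bs->file would do):
 *
 *     ret = bdrv_make_zero(bs->file, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         ... handle the error ...
 *     }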
868 */ 869 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) 870 { 871 int ret; 872 int64_t target_size, bytes, offset = 0; 873 BlockDriverState *bs = child->bs; 874 875 target_size = bdrv_getlength(bs); 876 if (target_size < 0) { 877 return target_size; 878 } 879 880 for (;;) { 881 bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES); 882 if (bytes <= 0) { 883 return 0; 884 } 885 ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL); 886 if (ret < 0) { 887 error_report("error getting block status at offset %" PRId64 ": %s", 888 offset, strerror(-ret)); 889 return ret; 890 } 891 if (ret & BDRV_BLOCK_ZERO) { 892 offset += bytes; 893 continue; 894 } 895 ret = bdrv_pwrite_zeroes(child, offset, bytes, flags); 896 if (ret < 0) { 897 error_report("error writing zeroes at offset %" PRId64 ": %s", 898 offset, strerror(-ret)); 899 return ret; 900 } 901 offset += bytes; 902 } 903 } 904 905 int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) 906 { 907 int ret; 908 909 ret = bdrv_prwv_co(child, offset, qiov, false, 0); 910 if (ret < 0) { 911 return ret; 912 } 913 914 return qiov->size; 915 } 916 917 int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) 918 { 919 QEMUIOVector qiov; 920 struct iovec iov = { 921 .iov_base = (void *)buf, 922 .iov_len = bytes, 923 }; 924 925 if (bytes < 0) { 926 return -EINVAL; 927 } 928 929 qemu_iovec_init_external(&qiov, &iov, 1); 930 return bdrv_preadv(child, offset, &qiov); 931 } 932 933 int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) 934 { 935 int ret; 936 937 ret = bdrv_prwv_co(child, offset, qiov, true, 0); 938 if (ret < 0) { 939 return ret; 940 } 941 942 return qiov->size; 943 } 944 945 int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) 946 { 947 QEMUIOVector qiov; 948 struct iovec iov = { 949 .iov_base = (void *) buf, 950 .iov_len = bytes, 951 }; 952 953 if (bytes < 0) { 954 return -EINVAL; 955 } 956 957 qemu_iovec_init_external(&qiov, &iov, 1); 958 return bdrv_pwritev(child, offset, &qiov); 959 } 960 961 /* 962 * Writes to the file and ensures that no writes are reordered across this 963 * request (acts as a barrier) 964 * 965 * Returns 0 on success, -errno in error cases. 
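 *
 * (As implemented below, this is simply bdrv_pwrite() followed by
 * bdrv_flush() on the child's BlockDriverState; the first failure of
 * either step is returned.)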
966 */ 967 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, 968 const void *buf, int count) 969 { 970 int ret; 971 972 ret = bdrv_pwrite(child, offset, buf, count); 973 if (ret < 0) { 974 return ret; 975 } 976 977 ret = bdrv_flush(child->bs); 978 if (ret < 0) { 979 return ret; 980 } 981 982 return 0; 983 } 984 985 typedef struct CoroutineIOCompletion { 986 Coroutine *coroutine; 987 int ret; 988 } CoroutineIOCompletion; 989 990 static void bdrv_co_io_em_complete(void *opaque, int ret) 991 { 992 CoroutineIOCompletion *co = opaque; 993 994 co->ret = ret; 995 aio_co_wake(co->coroutine); 996 } 997 998 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, 999 uint64_t offset, uint64_t bytes, 1000 QEMUIOVector *qiov, int flags) 1001 { 1002 BlockDriver *drv = bs->drv; 1003 int64_t sector_num; 1004 unsigned int nb_sectors; 1005 1006 assert(!(flags & ~BDRV_REQ_MASK)); 1007 1008 if (!drv) { 1009 return -ENOMEDIUM; 1010 } 1011 1012 if (drv->bdrv_co_preadv) { 1013 return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); 1014 } 1015 1016 if (drv->bdrv_aio_preadv) { 1017 BlockAIOCB *acb; 1018 CoroutineIOCompletion co = { 1019 .coroutine = qemu_coroutine_self(), 1020 }; 1021 1022 acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags, 1023 bdrv_co_io_em_complete, &co); 1024 if (acb == NULL) { 1025 return -EIO; 1026 } else { 1027 qemu_coroutine_yield(); 1028 return co.ret; 1029 } 1030 } 1031 1032 sector_num = offset >> BDRV_SECTOR_BITS; 1033 nb_sectors = bytes >> BDRV_SECTOR_BITS; 1034 1035 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1036 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1037 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); 1038 assert(drv->bdrv_co_readv); 1039 1040 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 1041 } 1042 1043 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, 1044 uint64_t offset, uint64_t bytes, 1045 QEMUIOVector *qiov, int flags) 1046 { 1047 BlockDriver *drv = bs->drv; 1048 int64_t sector_num; 1049 unsigned int nb_sectors; 1050 int ret; 1051 1052 assert(!(flags & ~BDRV_REQ_MASK)); 1053 1054 if (!drv) { 1055 return -ENOMEDIUM; 1056 } 1057 1058 if (drv->bdrv_co_pwritev) { 1059 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, 1060 flags & bs->supported_write_flags); 1061 flags &= ~bs->supported_write_flags; 1062 goto emulate_flags; 1063 } 1064 1065 if (drv->bdrv_aio_pwritev) { 1066 BlockAIOCB *acb; 1067 CoroutineIOCompletion co = { 1068 .coroutine = qemu_coroutine_self(), 1069 }; 1070 1071 acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, 1072 flags & bs->supported_write_flags, 1073 bdrv_co_io_em_complete, &co); 1074 flags &= ~bs->supported_write_flags; 1075 if (acb == NULL) { 1076 ret = -EIO; 1077 } else { 1078 qemu_coroutine_yield(); 1079 ret = co.ret; 1080 } 1081 goto emulate_flags; 1082 } 1083 1084 sector_num = offset >> BDRV_SECTOR_BITS; 1085 nb_sectors = bytes >> BDRV_SECTOR_BITS; 1086 1087 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1088 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1089 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS); 1090 1091 assert(drv->bdrv_co_writev); 1092 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, 1093 flags & bs->supported_write_flags); 1094 flags &= ~bs->supported_write_flags; 1095 1096 emulate_flags: 1097 if (ret == 0 && (flags & BDRV_REQ_FUA)) { 1098 ret = bdrv_co_flush(bs); 1099 } 1100 1101 return ret; 1102 } 1103 1104 static int coroutine_fn 1105 bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset, 1106 
uint64_t bytes, QEMUIOVector *qiov) 1107 { 1108 BlockDriver *drv = bs->drv; 1109 1110 if (!drv) { 1111 return -ENOMEDIUM; 1112 } 1113 1114 if (!drv->bdrv_co_pwritev_compressed) { 1115 return -ENOTSUP; 1116 } 1117 1118 return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov); 1119 } 1120 1121 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 1122 int64_t offset, unsigned int bytes, QEMUIOVector *qiov) 1123 { 1124 BlockDriverState *bs = child->bs; 1125 1126 /* Perform I/O through a temporary buffer so that users who scribble over 1127 * their read buffer while the operation is in progress do not end up 1128 * modifying the image file. This is critical for zero-copy guest I/O 1129 * where anything might happen inside guest memory. 1130 */ 1131 void *bounce_buffer; 1132 1133 BlockDriver *drv = bs->drv; 1134 struct iovec iov; 1135 QEMUIOVector local_qiov; 1136 int64_t cluster_offset; 1137 int64_t cluster_bytes; 1138 size_t skip_bytes; 1139 int ret; 1140 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1141 BDRV_REQUEST_MAX_BYTES); 1142 unsigned int progress = 0; 1143 1144 if (!drv) { 1145 return -ENOMEDIUM; 1146 } 1147 1148 /* FIXME We cannot require callers to have write permissions when all they 1149 * are doing is a read request. If we did things right, write permissions 1150 * would be obtained anyway, but internally by the copy-on-read code. As 1151 * long as it is implemented here rather than in a separate filter driver, 1152 * the copy-on-read code doesn't have its own BdrvChild, however, for which 1153 * it could request permissions. Therefore we have to bypass the permission 1154 * system for the moment. */ 1155 // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1156 1157 /* Cover entire cluster so no additional backing file I/O is required when 1158 * allocating cluster in the image file. Note that this value may exceed 1159 * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which 1160 * is one reason we loop rather than doing it all at once. 1161 */ 1162 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1163 skip_bytes = offset - cluster_offset; 1164 1165 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1166 cluster_offset, cluster_bytes); 1167 1168 bounce_buffer = qemu_try_blockalign(bs, 1169 MIN(MIN(max_transfer, cluster_bytes), 1170 MAX_BOUNCE_BUFFER)); 1171 if (bounce_buffer == NULL) { 1172 ret = -ENOMEM; 1173 goto err; 1174 } 1175 1176 while (cluster_bytes) { 1177 int64_t pnum; 1178 1179 ret = bdrv_is_allocated(bs, cluster_offset, 1180 MIN(cluster_bytes, max_transfer), &pnum); 1181 if (ret < 0) { 1182 /* Safe to treat errors in querying allocation as if 1183 * unallocated; we'll probably fail again soon on the 1184 * read, but at least that will set a decent errno. 
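             * (pnum is still clamped to the current chunk just below, so the
             * loop keeps making forward progress in that case.)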
1185 */ 1186 pnum = MIN(cluster_bytes, max_transfer); 1187 } 1188 1189 assert(skip_bytes < pnum); 1190 1191 if (ret <= 0) { 1192 /* Must copy-on-read; use the bounce buffer */ 1193 iov.iov_base = bounce_buffer; 1194 iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 1195 qemu_iovec_init_external(&local_qiov, &iov, 1); 1196 1197 ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1198 &local_qiov, 0); 1199 if (ret < 0) { 1200 goto err; 1201 } 1202 1203 bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1204 if (drv->bdrv_co_pwrite_zeroes && 1205 buffer_is_zero(bounce_buffer, pnum)) { 1206 /* FIXME: Should we (perhaps conditionally) be setting 1207 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1208 * that still correctly reads as zero? */ 1209 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 1210 BDRV_REQ_WRITE_UNCHANGED); 1211 } else { 1212 /* This does not change the data on the disk, it is not 1213 * necessary to flush even in cache=writethrough mode. 1214 */ 1215 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 1216 &local_qiov, 1217 BDRV_REQ_WRITE_UNCHANGED); 1218 } 1219 1220 if (ret < 0) { 1221 /* It might be okay to ignore write errors for guest 1222 * requests. If this is a deliberate copy-on-read 1223 * then we don't want to ignore the error. Simply 1224 * report it in all cases. 1225 */ 1226 goto err; 1227 } 1228 1229 qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes, 1230 pnum - skip_bytes); 1231 } else { 1232 /* Read directly into the destination */ 1233 qemu_iovec_init(&local_qiov, qiov->niov); 1234 qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes); 1235 ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size, 1236 &local_qiov, 0); 1237 qemu_iovec_destroy(&local_qiov); 1238 if (ret < 0) { 1239 goto err; 1240 } 1241 } 1242 1243 cluster_offset += pnum; 1244 cluster_bytes -= pnum; 1245 progress += pnum - skip_bytes; 1246 skip_bytes = 0; 1247 } 1248 ret = 0; 1249 1250 err: 1251 qemu_vfree(bounce_buffer); 1252 return ret; 1253 } 1254 1255 /* 1256 * Forwards an already correctly aligned request to the BlockDriver. This 1257 * handles copy on read, zeroing after EOF, and fragmentation of large 1258 * reads; any other features must be implemented by the caller. 1259 */ 1260 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 1261 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1262 int64_t align, QEMUIOVector *qiov, int flags) 1263 { 1264 BlockDriverState *bs = child->bs; 1265 int64_t total_bytes, max_bytes; 1266 int ret = 0; 1267 uint64_t bytes_remaining = bytes; 1268 int max_transfer; 1269 1270 assert(is_power_of_2(align)); 1271 assert((offset & (align - 1)) == 0); 1272 assert((bytes & (align - 1)) == 0); 1273 assert(!qiov || bytes == qiov->size); 1274 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1275 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1276 align); 1277 1278 /* TODO: We would need a per-BDS .supported_read_flags and 1279 * potential fallback support, if we ever implement any read flags 1280 * to pass through to drivers. For now, there aren't any 1281 * passthrough flags. */ 1282 assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); 1283 1284 /* Handle Copy on Read and associated serialisation */ 1285 if (flags & BDRV_REQ_COPY_ON_READ) { 1286 /* If we touch the same cluster it counts as an overlap. This 1287 * guarantees that allocating writes will be serialized and not race 1288 * with each other for the same cluster. 
For example, in copy-on-read 1289 * it ensures that the CoR read and write operations are atomic and 1290 * guest writes cannot interleave between them. */ 1291 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 1292 } 1293 1294 if (!(flags & BDRV_REQ_NO_SERIALISING)) { 1295 wait_serialising_requests(req); 1296 } 1297 1298 if (flags & BDRV_REQ_COPY_ON_READ) { 1299 int64_t pnum; 1300 1301 ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 1302 if (ret < 0) { 1303 goto out; 1304 } 1305 1306 if (!ret || pnum != bytes) { 1307 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov); 1308 goto out; 1309 } 1310 } 1311 1312 /* Forward the request to the BlockDriver, possibly fragmenting it */ 1313 total_bytes = bdrv_getlength(bs); 1314 if (total_bytes < 0) { 1315 ret = total_bytes; 1316 goto out; 1317 } 1318 1319 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 1320 if (bytes <= max_bytes && bytes <= max_transfer) { 1321 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); 1322 goto out; 1323 } 1324 1325 while (bytes_remaining) { 1326 int num; 1327 1328 if (max_bytes) { 1329 QEMUIOVector local_qiov; 1330 1331 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 1332 assert(num); 1333 qemu_iovec_init(&local_qiov, qiov->niov); 1334 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 1335 1336 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1337 num, &local_qiov, 0); 1338 max_bytes -= num; 1339 qemu_iovec_destroy(&local_qiov); 1340 } else { 1341 num = bytes_remaining; 1342 ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, 1343 bytes_remaining); 1344 } 1345 if (ret < 0) { 1346 goto out; 1347 } 1348 bytes_remaining -= num; 1349 } 1350 1351 out: 1352 return ret < 0 ? ret : 0; 1353 } 1354 1355 /* 1356 * Handle a read request in coroutine context 1357 */ 1358 int coroutine_fn bdrv_co_preadv(BdrvChild *child, 1359 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1360 BdrvRequestFlags flags) 1361 { 1362 BlockDriverState *bs = child->bs; 1363 BlockDriver *drv = bs->drv; 1364 BdrvTrackedRequest req; 1365 1366 uint64_t align = bs->bl.request_alignment; 1367 uint8_t *head_buf = NULL; 1368 uint8_t *tail_buf = NULL; 1369 QEMUIOVector local_qiov; 1370 bool use_local_qiov = false; 1371 int ret; 1372 1373 trace_bdrv_co_preadv(child->bs, offset, bytes, flags); 1374 1375 if (!drv) { 1376 return -ENOMEDIUM; 1377 } 1378 1379 ret = bdrv_check_byte_request(bs, offset, bytes); 1380 if (ret < 0) { 1381 return ret; 1382 } 1383 1384 bdrv_inc_in_flight(bs); 1385 1386 /* Don't do copy-on-read if we read data before write operation */ 1387 if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) { 1388 flags |= BDRV_REQ_COPY_ON_READ; 1389 } 1390 1391 /* Align read if necessary by padding qiov */ 1392 if (offset & (align - 1)) { 1393 head_buf = qemu_blockalign(bs, align); 1394 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1395 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1396 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1397 use_local_qiov = true; 1398 1399 bytes += offset & (align - 1); 1400 offset = offset & ~(align - 1); 1401 } 1402 1403 if ((offset + bytes) & (align - 1)) { 1404 if (!use_local_qiov) { 1405 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1406 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1407 use_local_qiov = true; 1408 } 1409 tail_buf = qemu_blockalign(bs, align); 1410 qemu_iovec_add(&local_qiov, tail_buf, 1411 align - ((offset + bytes) & (align - 1))); 1412 1413 bytes = ROUND_UP(bytes, 
align); 1414 } 1415 1416 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 1417 ret = bdrv_aligned_preadv(child, &req, offset, bytes, align, 1418 use_local_qiov ? &local_qiov : qiov, 1419 flags); 1420 tracked_request_end(&req); 1421 bdrv_dec_in_flight(bs); 1422 1423 if (use_local_qiov) { 1424 qemu_iovec_destroy(&local_qiov); 1425 qemu_vfree(head_buf); 1426 qemu_vfree(tail_buf); 1427 } 1428 1429 return ret; 1430 } 1431 1432 static int coroutine_fn bdrv_co_do_readv(BdrvChild *child, 1433 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1434 BdrvRequestFlags flags) 1435 { 1436 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1437 return -EINVAL; 1438 } 1439 1440 return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS, 1441 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1442 } 1443 1444 int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num, 1445 int nb_sectors, QEMUIOVector *qiov) 1446 { 1447 return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0); 1448 } 1449 1450 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1451 int64_t offset, int bytes, BdrvRequestFlags flags) 1452 { 1453 BlockDriver *drv = bs->drv; 1454 QEMUIOVector qiov; 1455 struct iovec iov = {0}; 1456 int ret = 0; 1457 bool need_flush = false; 1458 int head = 0; 1459 int tail = 0; 1460 1461 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); 1462 int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1463 bs->bl.request_alignment); 1464 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1465 1466 if (!drv) { 1467 return -ENOMEDIUM; 1468 } 1469 1470 assert(alignment % bs->bl.request_alignment == 0); 1471 head = offset % alignment; 1472 tail = (offset + bytes) % alignment; 1473 max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1474 assert(max_write_zeroes >= bs->bl.request_alignment); 1475 1476 while (bytes > 0 && !ret) { 1477 int num = bytes; 1478 1479 /* Align request. Block drivers can expect the "bulk" of the request 1480 * to be aligned, and that unaligned requests do not cross cluster 1481 * boundaries. 1482 */ 1483 if (head) { 1484 /* Make a small request up to the first aligned sector. For 1485 * convenience, limit this request to max_transfer even if 1486 * we don't need to fall back to writes. */ 1487 num = MIN(MIN(bytes, max_transfer), alignment - head); 1488 head = (head + num) % alignment; 1489 assert(num < max_write_zeroes); 1490 } else if (tail && num > alignment) { 1491 /* Shorten the request to the last aligned sector. 
             */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
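 *
 * (Fragmentation sketch, per the loop below: anything larger than the
 * node's max_transfer limit is split into max_transfer sized pieces, and
 * when BDRV_REQ_FUA has to be emulated by a flush, only the final piece
 * keeps the flag.)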
1562 */ 1563 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1564 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1565 int64_t align, QEMUIOVector *qiov, int flags) 1566 { 1567 BlockDriverState *bs = child->bs; 1568 BlockDriver *drv = bs->drv; 1569 bool waited; 1570 int ret; 1571 1572 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 1573 uint64_t bytes_remaining = bytes; 1574 int max_transfer; 1575 1576 if (!drv) { 1577 return -ENOMEDIUM; 1578 } 1579 1580 if (bdrv_has_readonly_bitmaps(bs)) { 1581 return -EPERM; 1582 } 1583 1584 assert(is_power_of_2(align)); 1585 assert((offset & (align - 1)) == 0); 1586 assert((bytes & (align - 1)) == 0); 1587 assert(!qiov || bytes == qiov->size); 1588 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1589 assert(!(flags & ~BDRV_REQ_MASK)); 1590 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 1591 align); 1592 1593 waited = wait_serialising_requests(req); 1594 assert(!waited || !req->serialising); 1595 assert(req->overlap_offset <= offset); 1596 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1597 if (flags & BDRV_REQ_WRITE_UNCHANGED) { 1598 assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1599 } else { 1600 assert(child->perm & BLK_PERM_WRITE); 1601 } 1602 assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); 1603 1604 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1605 1606 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1607 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 1608 qemu_iovec_is_zero(qiov)) { 1609 flags |= BDRV_REQ_ZERO_WRITE; 1610 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1611 flags |= BDRV_REQ_MAY_UNMAP; 1612 } 1613 } 1614 1615 if (ret < 0) { 1616 /* Do nothing, write notifier decided to fail this request */ 1617 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1618 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1619 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 1620 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 1621 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov); 1622 } else if (bytes <= max_transfer) { 1623 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1624 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); 1625 } else { 1626 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1627 while (bytes_remaining) { 1628 int num = MIN(bytes_remaining, max_transfer); 1629 QEMUIOVector local_qiov; 1630 int local_flags = flags; 1631 1632 assert(num); 1633 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 1634 !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1635 /* If FUA is going to be emulated by flush, we only 1636 * need to flush on the last iteration */ 1637 local_flags &= ~BDRV_REQ_FUA; 1638 } 1639 qemu_iovec_init(&local_qiov, qiov->niov); 1640 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 1641 1642 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 1643 num, &local_qiov, local_flags); 1644 qemu_iovec_destroy(&local_qiov); 1645 if (ret < 0) { 1646 break; 1647 } 1648 bytes_remaining -= num; 1649 } 1650 } 1651 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 1652 1653 atomic_inc(&bs->write_gen); 1654 bdrv_set_dirty(bs, offset, bytes); 1655 1656 stat64_max(&bs->wr_highest_offset, offset + bytes); 1657 1658 if (ret >= 0) { 1659 bs->total_sectors = MAX(bs->total_sectors, end_sector); 1660 ret = 0; 1661 } 1662 1663 return ret; 1664 } 1665 1666 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild 
*child, 1667 int64_t offset, 1668 unsigned int bytes, 1669 BdrvRequestFlags flags, 1670 BdrvTrackedRequest *req) 1671 { 1672 BlockDriverState *bs = child->bs; 1673 uint8_t *buf = NULL; 1674 QEMUIOVector local_qiov; 1675 struct iovec iov; 1676 uint64_t align = bs->bl.request_alignment; 1677 unsigned int head_padding_bytes, tail_padding_bytes; 1678 int ret = 0; 1679 1680 head_padding_bytes = offset & (align - 1); 1681 tail_padding_bytes = (align - (offset + bytes)) & (align - 1); 1682 1683 1684 assert(flags & BDRV_REQ_ZERO_WRITE); 1685 if (head_padding_bytes || tail_padding_bytes) { 1686 buf = qemu_blockalign(bs, align); 1687 iov = (struct iovec) { 1688 .iov_base = buf, 1689 .iov_len = align, 1690 }; 1691 qemu_iovec_init_external(&local_qiov, &iov, 1); 1692 } 1693 if (head_padding_bytes) { 1694 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1695 1696 /* RMW the unaligned part before head. */ 1697 mark_request_serialising(req, align); 1698 wait_serialising_requests(req); 1699 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1700 ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align, 1701 align, &local_qiov, 0); 1702 if (ret < 0) { 1703 goto fail; 1704 } 1705 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1706 1707 memset(buf + head_padding_bytes, 0, zero_bytes); 1708 ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align, 1709 align, &local_qiov, 1710 flags & ~BDRV_REQ_ZERO_WRITE); 1711 if (ret < 0) { 1712 goto fail; 1713 } 1714 offset += zero_bytes; 1715 bytes -= zero_bytes; 1716 } 1717 1718 assert(!bytes || (offset & (align - 1)) == 0); 1719 if (bytes >= align) { 1720 /* Write the aligned part in the middle. */ 1721 uint64_t aligned_bytes = bytes & ~(align - 1); 1722 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 1723 NULL, flags); 1724 if (ret < 0) { 1725 goto fail; 1726 } 1727 bytes -= aligned_bytes; 1728 offset += aligned_bytes; 1729 } 1730 1731 assert(!bytes || (offset & (align - 1)) == 0); 1732 if (bytes) { 1733 assert(align == tail_padding_bytes + bytes); 1734 /* RMW the unaligned part after tail. 
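         * (Same read-modify-write pattern as the head above: read one
         * aligned block, zero its first 'bytes' bytes, and write it back
         * with BDRV_REQ_ZERO_WRITE cleared.)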
*/ 1735 mark_request_serialising(req, align); 1736 wait_serialising_requests(req); 1737 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1738 ret = bdrv_aligned_preadv(child, req, offset, align, 1739 align, &local_qiov, 0); 1740 if (ret < 0) { 1741 goto fail; 1742 } 1743 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1744 1745 memset(buf, 0, bytes); 1746 ret = bdrv_aligned_pwritev(child, req, offset, align, align, 1747 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1748 } 1749 fail: 1750 qemu_vfree(buf); 1751 return ret; 1752 1753 } 1754 1755 /* 1756 * Handle a write request in coroutine context 1757 */ 1758 int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 1759 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1760 BdrvRequestFlags flags) 1761 { 1762 BlockDriverState *bs = child->bs; 1763 BdrvTrackedRequest req; 1764 uint64_t align = bs->bl.request_alignment; 1765 uint8_t *head_buf = NULL; 1766 uint8_t *tail_buf = NULL; 1767 QEMUIOVector local_qiov; 1768 bool use_local_qiov = false; 1769 int ret; 1770 1771 trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); 1772 1773 if (!bs->drv) { 1774 return -ENOMEDIUM; 1775 } 1776 if (bs->read_only) { 1777 return -EPERM; 1778 } 1779 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1780 1781 ret = bdrv_check_byte_request(bs, offset, bytes); 1782 if (ret < 0) { 1783 return ret; 1784 } 1785 1786 bdrv_inc_in_flight(bs); 1787 /* 1788 * Align write if necessary by performing a read-modify-write cycle. 1789 * Pad qiov with the read parts and be sure to have a tracked request not 1790 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1791 */ 1792 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1793 1794 if (flags & BDRV_REQ_ZERO_WRITE) { 1795 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 1796 goto out; 1797 } 1798 1799 if (offset & (align - 1)) { 1800 QEMUIOVector head_qiov; 1801 struct iovec head_iov; 1802 1803 mark_request_serialising(&req, align); 1804 wait_serialising_requests(&req); 1805 1806 head_buf = qemu_blockalign(bs, align); 1807 head_iov = (struct iovec) { 1808 .iov_base = head_buf, 1809 .iov_len = align, 1810 }; 1811 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1812 1813 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1814 ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align, 1815 align, &head_qiov, 0); 1816 if (ret < 0) { 1817 goto fail; 1818 } 1819 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1820 1821 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1822 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1823 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1824 use_local_qiov = true; 1825 1826 bytes += offset & (align - 1); 1827 offset = offset & ~(align - 1); 1828 1829 /* We have read the tail already if the request is smaller 1830 * than one aligned block. 
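         * (The head RMW above read a full 'align' bytes into head_buf, so a
         * request that fits inside that block can take its tail padding from
         * head_buf instead of issuing a second read.)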
1831 */ 1832 if (bytes < align) { 1833 qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes); 1834 bytes = align; 1835 } 1836 } 1837 1838 if ((offset + bytes) & (align - 1)) { 1839 QEMUIOVector tail_qiov; 1840 struct iovec tail_iov; 1841 size_t tail_bytes; 1842 bool waited; 1843 1844 mark_request_serialising(&req, align); 1845 waited = wait_serialising_requests(&req); 1846 assert(!waited || !use_local_qiov); 1847 1848 tail_buf = qemu_blockalign(bs, align); 1849 tail_iov = (struct iovec) { 1850 .iov_base = tail_buf, 1851 .iov_len = align, 1852 }; 1853 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1854 1855 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1856 ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1), 1857 align, align, &tail_qiov, 0); 1858 if (ret < 0) { 1859 goto fail; 1860 } 1861 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1862 1863 if (!use_local_qiov) { 1864 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1865 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1866 use_local_qiov = true; 1867 } 1868 1869 tail_bytes = (offset + bytes) & (align - 1); 1870 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1871 1872 bytes = ROUND_UP(bytes, align); 1873 } 1874 1875 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 1876 use_local_qiov ? &local_qiov : qiov, 1877 flags); 1878 1879 fail: 1880 1881 if (use_local_qiov) { 1882 qemu_iovec_destroy(&local_qiov); 1883 } 1884 qemu_vfree(head_buf); 1885 qemu_vfree(tail_buf); 1886 out: 1887 tracked_request_end(&req); 1888 bdrv_dec_in_flight(bs); 1889 return ret; 1890 } 1891 1892 static int coroutine_fn bdrv_co_do_writev(BdrvChild *child, 1893 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1894 BdrvRequestFlags flags) 1895 { 1896 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1897 return -EINVAL; 1898 } 1899 1900 return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS, 1901 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1902 } 1903 1904 int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num, 1905 int nb_sectors, QEMUIOVector *qiov) 1906 { 1907 return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0); 1908 } 1909 1910 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 1911 int bytes, BdrvRequestFlags flags) 1912 { 1913 trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 1914 1915 if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 1916 flags &= ~BDRV_REQ_MAY_UNMAP; 1917 } 1918 1919 return bdrv_co_pwritev(child, offset, bytes, NULL, 1920 BDRV_REQ_ZERO_WRITE | flags); 1921 } 1922 1923 /* 1924 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not. 
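 *
 * (Per the loop below, a failing flush does not stop the iteration: the
 * remaining nodes are still flushed and the first error code encountered
 * is what gets returned.)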
1925 */ 1926 int bdrv_flush_all(void) 1927 { 1928 BdrvNextIterator it; 1929 BlockDriverState *bs = NULL; 1930 int result = 0; 1931 1932 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 1933 AioContext *aio_context = bdrv_get_aio_context(bs); 1934 int ret; 1935 1936 aio_context_acquire(aio_context); 1937 ret = bdrv_flush(bs); 1938 if (ret < 0 && !result) { 1939 result = ret; 1940 } 1941 aio_context_release(aio_context); 1942 } 1943 1944 return result; 1945 } 1946 1947 1948 typedef struct BdrvCoBlockStatusData { 1949 BlockDriverState *bs; 1950 BlockDriverState *base; 1951 bool want_zero; 1952 int64_t offset; 1953 int64_t bytes; 1954 int64_t *pnum; 1955 int64_t *map; 1956 BlockDriverState **file; 1957 int ret; 1958 bool done; 1959 } BdrvCoBlockStatusData; 1960 1961 int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, 1962 bool want_zero, 1963 int64_t offset, 1964 int64_t bytes, 1965 int64_t *pnum, 1966 int64_t *map, 1967 BlockDriverState **file) 1968 { 1969 assert(bs->file && bs->file->bs); 1970 *pnum = bytes; 1971 *map = offset; 1972 *file = bs->file->bs; 1973 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 1974 } 1975 1976 int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, 1977 bool want_zero, 1978 int64_t offset, 1979 int64_t bytes, 1980 int64_t *pnum, 1981 int64_t *map, 1982 BlockDriverState **file) 1983 { 1984 assert(bs->backing && bs->backing->bs); 1985 *pnum = bytes; 1986 *map = offset; 1987 *file = bs->backing->bs; 1988 return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 1989 } 1990 1991 /* 1992 * Returns the allocation status of the specified sectors. 1993 * Drivers not implementing the functionality are assumed to not support 1994 * backing files, hence all their sectors are reported as allocated. 1995 * 1996 * If 'want_zero' is true, the caller is querying for mapping 1997 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 1998 * _ZERO where possible; otherwise, the result favors larger 'pnum', 1999 * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2000 * 2001 * If 'offset' is beyond the end of the disk image the return value is 2002 * BDRV_BLOCK_EOF and 'pnum' is set to 0. 2003 * 2004 * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2005 * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2006 * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 2007 * 2008 * 'pnum' is set to the number of bytes (including and immediately 2009 * following the specified offset) that are easily known to be in the 2010 * same allocated/unallocated state. Note that a second call starting 2011 * at the original offset plus returned pnum may have the same status. 2012 * The returned value is non-zero on success except at end-of-file. 2013 * 2014 * Returns negative errno on failure. Otherwise, if the 2015 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 2016 * set to the host mapping and BDS corresponding to the guest offset. 
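 *
 * Illustrative example (hypothetical numbers): a query with offset=4096 and
 * bytes=65536 might come back with *pnum=16384 and a return value of
 * BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_OFFSET_VALID, with
 * *map and *file describing where those 16384 bytes live; the caller would
 * then repeat the query at offset + *pnum to cover the rest of the range.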
/*
 * Returns the allocation status of the specified range.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all of their data is reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
    if (ret < 0) {
        *pnum = 0;
        goto out;
    }

    /*
     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
     */
    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
           align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        local_map += offset - aligned_offset;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else if (want_zero) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t size2 = bdrv_getlength(bs2);

            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (want_zero && local_file && local_file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int64_t file_pnum;
        int ret2;

        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

out:
    bdrv_dec_in_flight(bs);
    if (ret >= 0 && offset + *pnum == total_size) {
        ret |= BDRV_BLOCK_EOF;
    }
early_out:
    if (file) {
        *file = local_file;
    }
    if (map) {
        *map = local_map;
    }
    return ret;
}

static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
                                                   BlockDriverState *base,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    BlockDriverState *p;
    int ret = 0;
    bool first = true;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                   file);
        if (ret < 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
            *pnum = bytes;
        }
        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
            break;
        }
        /* [offset, pnum] unallocated on this layer, which could be only
         * the first part of [offset, bytes].  */
        bytes = MIN(bytes, *pnum);
        first = false;
    }
    return ret;
}

/* Coroutine wrapper for bdrv_block_status_above() */
static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
{
    BdrvCoBlockStatusData *data = opaque;

    data->ret = bdrv_co_block_status_above(data->bs, data->base,
                                           data->want_zero,
                                           data->offset, data->bytes,
                                           data->pnum, data->map, data->file);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_block_status_above().
 *
 * See bdrv_co_block_status_above() for details.
 */
static int bdrv_common_block_status_above(BlockDriverState *bs,
                                          BlockDriverState *base,
                                          bool want_zero, int64_t offset,
                                          int64_t bytes, int64_t *pnum,
                                          int64_t *map,
                                          BlockDriverState **file)
{
    Coroutine *co;
    BdrvCoBlockStatusData data = {
        .bs = bs,
        .base = base,
        .want_zero = want_zero,
        .offset = offset,
        .bytes = bytes,
        .pnum = pnum,
        .map = map,
        .file = file,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_block_status_above_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
}

int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file)
{
    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
                                          pnum, map, file);
}

int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    return bdrv_block_status_above(bs, backing_bs(bs),
                                   offset, bytes, pnum, map, file);
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int ret;
    int64_t dummy;

    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
                                         bytes, pnum ? pnum : &dummy, NULL,
                                         NULL);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}

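/*
 * Illustrative sketch (not part of the original source): iterating over an
 * image's extents with bdrv_block_status().  Because *pnum only covers the
 * prefix with a uniform status, the caller advances by *pnum and repeats
 * until BDRV_BLOCK_EOF (or an error) is seen.  example_count_allocated()
 * is a hypothetical helper that sums up the allocated bytes.
 */
static int64_t example_count_allocated(BlockDriverState *bs)
{
    int64_t total = bdrv_getlength(bs);
    int64_t offset = 0;
    int64_t allocated = 0;

    if (total < 0) {
        return total;
    }

    while (offset < total) {
        int64_t pnum;
        int ret = bdrv_block_status(bs, offset, total - offset,
                                    &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ALLOCATED) {
            allocated += pnum;
        }
        offset += pnum;
        if (ret & BDRV_BLOCK_EOF) {
            break;
        }
    }

    return allocated;
}
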
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * offset is allocated in any image of the chain.  Return false otherwise,
 * or negative errno on failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum)
{
    BlockDriverState *intermediate;
    int ret;
    int64_t n = bytes;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int64_t pnum_inter;
        int64_t size_inter;

        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        size_inter = bdrv_getlength(intermediate);
        if (size_inter < 0) {
            return size_inter;
        }
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}

typedef struct BdrvVmstateCo {
    BlockDriverState *bs;
    QEMUIOVector *qiov;
    int64_t pos;
    bool is_read;
    int ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
    int ret = -ENOTSUP;

    bdrv_inc_in_flight(bs);

    if (!drv) {
        ret = -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
    } else if (bs->file) {
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    bdrv_dec_in_flight(bs);
    return ret;
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs = bs,
            .qiov = qiov,
            .pos = pos,
            .is_read = is_read,
            .ret = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
        return data.ret;
    }
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}

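/*
 * Illustrative sketch (not part of the original source): storing and reading
 * back a small blob in the vmstate area of an image whose driver supports it
 * (e.g. qcow2).  example_roundtrip_vmstate() is a hypothetical helper; it
 * simply pairs bdrv_save_vmstate() with bdrv_load_vmstate() at position 0.
 */
static int example_roundtrip_vmstate(BlockDriverState *bs)
{
    uint8_t out[16] = "vmstate example";
    uint8_t in[16];
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    if (ret < 0) {
        return ret;
    }

    /* On success both calls return the number of bytes transferred. */
    return memcmp(in, out, sizeof(out)) == 0 ? 0 : -EIO;
}
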
/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

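/*
 * Illustrative sketch (not part of the original source): a caller that must
 * not block (for example while running in an I/O thread) uses the
 * asynchronous variant and relies on the completion callback still being
 * invoked for the cancelled request.  example_cancel_request() is a
 * hypothetical helper.
 */
static void example_cancel_request(BlockAIOCB *acb)
{
    if (acb) {
        /* Returns immediately; completion is still reported through the
         * request's own callback, so per-request state must not be freed
         * here. */
        bdrv_aio_cancel_async(acb);
    }
}
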
/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;


static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}

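/*
 * Illustrative sketch (not part of the original source): bdrv_flush() may be
 * called from outside coroutine context; it either runs bdrv_co_flush()
 * directly (fast path) or spawns a coroutine and polls it with
 * BDRV_POLL_WHILE().  example_flush_after_update() is a hypothetical helper
 * that makes earlier writes durable.
 */
static int example_flush_after_update(BdrvChild *child)
{
    /* ... earlier writes to @child have completed at this point ... */

    /* Flush the format layer and, transitively, the protocol layer below. */
    return bdrv_flush(child->bs);
}
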
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int bytes;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
}

int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, req.offset, req.bytes);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}

int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

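/*
 * Illustrative sketch (not part of the original source): callers that cannot
 * tolerate an abort on allocation failure use qemu_try_blockalign() (or the
 * zeroing variant) and check for NULL, returning -ENOMEM themselves.
 * example_with_bounce_buffer() is a hypothetical helper.
 */
static int example_with_bounce_buffer(BlockDriverState *bs, size_t size)
{
    uint8_t *bounce = qemu_try_blockalign0(bs, size);

    if (bounce == NULL) {
        return -ENOMEM;
    }

    /* ... use the zeroed, suitably aligned buffer for I/O here ... */

    qemu_vfree(bounce);
    return 0;
}
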
/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}

void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_register_buf) {
        bs->drv->bdrv_register_buf(bs, host, size);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_register_buf(child->bs, host, size);
    }
}

void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_unregister_buf) {
        bs->drv->bdrv_unregister_buf(bs, host);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_unregister_buf(child->bs, host);
    }
}

static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src,
                                                    uint64_t src_offset,
                                                    BdrvChild *dst,
                                                    uint64_t dst_offset,
                                                    uint64_t bytes,
                                                    BdrvRequestFlags flags,
                                                    bool recurse_src)
{
    int ret;

    if (!src || !dst || !src->bs || !dst->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
    if (ret) {
        return ret;
    }

    ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
    if (ret) {
        return ret;
    }
    if (flags & BDRV_REQ_ZERO_WRITE) {
        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags);
    }

    if (!src->bs->drv->bdrv_co_copy_range_from
        || !dst->bs->drv->bdrv_co_copy_range_to
        || src->bs->encrypted || dst->bs->encrypted) {
        return -ENOTSUP;
    }
    if (recurse_src) {
        return src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                     src, src_offset,
                                                     dst, dst_offset,
                                                     bytes, flags);
    } else {
        return dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
                                                   src, src_offset,
                                                   dst, dst_offset,
                                                   bytes, flags);
    }
}

/* Copy range from @src to @dst.
 *
 * See the comment of bdrv_co_copy_range for the parameter and return value
 * semantics.
 */
int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
                                         BdrvChild *dst, uint64_t dst_offset,
                                         uint64_t bytes, BdrvRequestFlags flags)
{
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, flags, true);
}

/* Copy range from @src to @dst.
 *
 * See the comment of bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
                                       BdrvChild *dst, uint64_t dst_offset,
                                       uint64_t bytes, BdrvRequestFlags flags)
{
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, flags, false);
}

int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
                                    BdrvChild *dst, uint64_t dst_offset,
                                    uint64_t bytes, BdrvRequestFlags flags)
{
    BdrvTrackedRequest src_req, dst_req;
    BlockDriverState *src_bs = src->bs;
    BlockDriverState *dst_bs = dst->bs;
    int ret;

    bdrv_inc_in_flight(src_bs);
    bdrv_inc_in_flight(dst_bs);
    tracked_request_begin(&src_req, src_bs, src_offset,
                          bytes, BDRV_TRACKED_READ);
    tracked_request_begin(&dst_req, dst_bs, dst_offset,
                          bytes, BDRV_TRACKED_WRITE);

    wait_serialising_requests(&src_req);
    wait_serialising_requests(&dst_req);
    ret = bdrv_co_copy_range_from(src, src_offset,
                                  dst, dst_offset,
                                  bytes, flags);

    tracked_request_end(&src_req);
    tracked_request_end(&dst_req);
    bdrv_dec_in_flight(src_bs);
    bdrv_dec_in_flight(dst_bs);
    return ret;
}

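/*
 * Illustrative sketch (not part of the original source): attempting an
 * offloaded copy between two children.  If neither driver implements the
 * copy_range callbacks, -ENOTSUP is returned and the caller is expected to
 * fall back to a regular read/write loop.  example_co_try_copy() is a
 * hypothetical helper.
 */
static int coroutine_fn example_co_try_copy(BdrvChild *src, BdrvChild *dst,
                                            uint64_t offset, uint64_t bytes)
{
    int ret = bdrv_co_copy_range(src, offset, dst, offset, bytes, 0);

    if (ret == -ENOTSUP) {
        /* ... bounce the data through a buffer with
         * bdrv_co_preadv()/bdrv_co_pwritev() instead ... */
    }
    return ret;
}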