/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    BdrvChild *parent;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_wakeup(bs);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
{
    BdrvChild *child, *tmp;
    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
    bdrv_coroutine_enter(bs, data.co);
    BDRV_POLL_WHILE(bs, !data.done);

    if (recursive) {
        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
            bdrv_drain_invoke(child->bs, begin, true);
        }
    }
}

static bool bdrv_drain_recurse(BlockDriverState *bs)
{
    BdrvChild *child, *tmp;
    bool waited;

    /* Wait for drained requests to finish */
    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
        BlockDriverState *bs = child->bs;
        bool in_main_loop =
            qemu_get_current_aio_context() == qemu_get_aio_context();
        assert(bs->refcnt > 0);
        if (in_main_loop) {
            /* In case the recursive bdrv_drain_recurse processes a
             * block_job_defer_to_main_loop BH and modifies the graph,
             * let's hold a reference to bs until we are done.
             *
             * IOThread doesn't have such a BH, and it is not safe to call
             * bdrv_unref without BQL, so skip doing it there.
             */
            bdrv_ref(bs);
        }
        waited |= bdrv_drain_recurse(bs);
        if (in_main_loop) {
            bdrv_unref(bs);
        }
    }

    return waited;
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    bdrv_dec_in_flight(bs);
    if (data->begin) {
        bdrv_do_drained_begin(bs, data->recursive, data->parent);
    } else {
        bdrv_do_drained_end(bs, data->recursive, data->parent);
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
    };
    bdrv_inc_in_flight(bs);
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                           BdrvChild *parent)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent);
        return;
    }

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent);
    bdrv_drain_invoke(bs, true, false);
    bdrv_drain_recurse(bs);

    if (recursive) {
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child);
        }
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL);
}

void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                         BdrvChild *parent)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, false);
    bdrv_parent_drained_end(bs, parent);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL);
}
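
/*
 * Illustrative sketch (added commentary, not part of the original file): the
 * drained-section API above is normally used as a bracket around work that
 * must not race with new or in-flight requests.  Assuming a caller that
 * already owns the node's AioContext, usage looks roughly like:
 *
 *     bdrv_drained_begin(bs);
 *     ... no new requests reach bs; previously in-flight requests are done ...
 *     bdrv_drained_end(bs);
 *
 * bdrv_subtree_drained_begin()/bdrv_subtree_drained_end() follow the same
 * pattern but also quiesce all of bs's children recursively.
 */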

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts.  In that case, use
 * bdrv_drain_all() instead.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool waited = true;
    BlockDriverState *bs;
    BdrvNextIterator it;
    GSList *aio_ctxs = NULL, *ctx;

    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
     * nodes in several different AioContexts, so make sure we're in the main
     * context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Stop things in parent-to-child order */
        aio_context_acquire(aio_context);
        aio_disable_external(aio_context);
        bdrv_parent_drained_begin(bs, NULL);
        bdrv_drain_invoke(bs, true, true);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion.  Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
     */
    while (waited) {
        waited = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;

            aio_context_acquire(aio_context);
            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    waited |= bdrv_drain_recurse(bs);
                }
            }
            aio_context_release(aio_context);
        }
    }

    g_slist_free(aio_ctxs);
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs;
    BdrvNextIterator it;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Re-enable things in child-to-parent order */
        aio_context_acquire(aio_context);
        bdrv_drain_invoke(bs, false, true);
        bdrv_parent_drained_end(bs, NULL);
        aio_enable_external(aio_context);
        aio_context_release(aio_context);
    }
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                                 - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}
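
/*
 * Worked example for bdrv_round_to_clusters() above (added commentary with
 * illustrative numbers, assuming a hypothetical 64 KiB cluster size): for
 * offset = 70000 and bytes = 1000, QEMU_ALIGN_DOWN(70000, 65536) yields
 * cluster_offset = 65536, and QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536)
 * yields cluster_bytes = 65536, i.e. the request is widened to cover the
 * single cluster it touches.
 */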

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick(bdrv_get_aio_wait(bs));
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}
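
/*
 * Note (added commentary, not in the original file): bdrv_prwv_co() is the
 * usual QEMU pattern for offering a synchronous wrapper around coroutine-only
 * code.  When already running in a coroutine it calls the entry function
 * directly; otherwise it spawns a coroutine and polls the node's AioContext
 * with BDRV_POLL_WHILE() until the NOT_DONE sentinel in RwCo.ret has been
 * replaced by the real return value.
 */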

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}
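
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * file; offset and length are made up for the example): a caller that wants
 * to zero a byte range and is happy for blocks to be deallocated where
 * possible would pass BDRV_REQ_MAY_UNMAP, e.g.
 *
 *     ret = bdrv_pwrite_zeroes(child, 0, 65536, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         ... handle error ...
 *     }
 *
 * The flag is only honoured when the node was opened with BDRV_O_UNMAP; see
 * bdrv_co_pwrite_zeroes() later in this file.
 */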

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        offset += bytes;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}
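
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * file): the byte-based synchronous helpers above are typically used by
 * format drivers for small metadata accesses, e.g. reading a header into a
 * local buffer:
 *
 *     uint8_t header[512];
 *     int ret = bdrv_pread(bs->file, 0, header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *
 * On success both bdrv_pread() and bdrv_pwrite() return the number of bytes
 * transferred (the full request size), never a short count.
 */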

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_readv) {
        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_writev_flags) {
        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
                                        flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
    } else if (drv->bdrv_co_writev) {
        assert(!bs->supported_write_flags);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    }

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}
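
/*
 * Note (added commentary, not in the original file): in bdrv_driver_pwritev()
 * above, any flag the driver advertises in bs->supported_write_flags is
 * passed straight through and then cleared from 'flags'.  Whatever remains
 * has to be emulated here; in practice that is BDRV_REQ_FUA, which the
 * emulate_flags path turns into an explicit bdrv_co_flush() after a
 * successful write.
 */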

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.  Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    while (cluster_bytes) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if
             * unallocated; we'll probably fail again soon on the
             * read, but at least that will set a decent errno.
             */
            pnum = MIN(cluster_bytes, max_transfer);
        }

        assert(skip_bytes < pnum);

        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            iov.iov_base = bounce_buffer;
            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_external(&local_qiov, &iov, 1);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            QEMUIOVector local_qiov;

            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Handle a read request in coroutine context
 */
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
                          nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
                               int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
}

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}
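
/*
 * Worked example for the head/tail alignment logic in
 * bdrv_co_do_pwrite_zeroes() above (added commentary with illustrative
 * numbers, assuming pwrite_zeroes_alignment = 4096): for offset = 4000 and
 * bytes = 10000, head = 4000 % 4096 = 4000 and tail = 14000 % 4096 = 1712.
 * The first iteration issues a small 96-byte request up to the 4096-byte
 * boundary, the second covers the aligned 8192 bytes in the middle, and the
 * final iteration handles the 1712-byte unaligned tail.
 */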

/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(child->perm & BLK_PERM_WRITE);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, offset, bytes);

    stat64_max(&bs->wr_highest_offset, offset + bytes);

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, end_sector);
        ret = 0;
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    BlockDriverState *bs = child->bs;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = bs->bl.request_alignment;
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);


    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base = buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
                                   align, &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}

/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base = head_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base = tail_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
                                int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend
 * or not.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}


typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    bool want_zero;
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
    BlockDriverState **file;
    int ret;
    bool done;
} BdrvCoBlockStatusData;

int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
                                                bool want_zero,
                                                int64_t offset,
                                                int64_t bytes,
                                                int64_t *pnum,
                                                int64_t *map,
                                                BlockDriverState **file)
{
    assert(bs->file && bs->file->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->file->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}

int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    assert(bs->backing && bs->backing->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}
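
/*
 * Illustrative sketch (added commentary, not part of the original file): a
 * caller that wants to walk an image's allocation map typically loops over
 * bdrv_block_status() (defined later in this file), advancing by the
 * returned pnum each time:
 *
 *     int64_t offset = 0, bytes = bdrv_getlength(bs), pnum;
 *     while (bytes > 0) {
 *         int ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
 *         if (ret < 0) {
 *             break;
 *         }
 *         ... ret & BDRV_BLOCK_DATA / BDRV_BLOCK_ZERO describe this extent ...
 *         offset += pnum;
 *         bytes -= pnum;
 *     }
 *
 * bdrv_make_zero() earlier in this file follows essentially this pattern.
 */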

/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
    if (ret < 0) {
        *pnum = 0;
        goto out;
    }

    /*
     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
1991 */ 1992 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 1993 align > offset - aligned_offset); 1994 *pnum -= offset - aligned_offset; 1995 if (*pnum > bytes) { 1996 *pnum = bytes; 1997 } 1998 if (ret & BDRV_BLOCK_OFFSET_VALID) { 1999 local_map += offset - aligned_offset; 2000 } 2001 2002 if (ret & BDRV_BLOCK_RAW) { 2003 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2004 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2005 *pnum, pnum, &local_map, &local_file); 2006 goto out; 2007 } 2008 2009 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2010 ret |= BDRV_BLOCK_ALLOCATED; 2011 } else if (want_zero) { 2012 if (bdrv_unallocated_blocks_are_zero(bs)) { 2013 ret |= BDRV_BLOCK_ZERO; 2014 } else if (bs->backing) { 2015 BlockDriverState *bs2 = bs->backing->bs; 2016 int64_t size2 = bdrv_getlength(bs2); 2017 2018 if (size2 >= 0 && offset >= size2) { 2019 ret |= BDRV_BLOCK_ZERO; 2020 } 2021 } 2022 } 2023 2024 if (want_zero && local_file && local_file != bs && 2025 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2026 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2027 int64_t file_pnum; 2028 int ret2; 2029 2030 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2031 *pnum, &file_pnum, NULL, NULL); 2032 if (ret2 >= 0) { 2033 /* Ignore errors. This is just providing extra information, it 2034 * is useful but not necessary. 2035 */ 2036 if (ret2 & BDRV_BLOCK_EOF && 2037 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2038 /* 2039 * It is valid for the format block driver to read 2040 * beyond the end of the underlying file's current 2041 * size; such areas read as zero. 2042 */ 2043 ret |= BDRV_BLOCK_ZERO; 2044 } else { 2045 /* Limit request to the range reported by the protocol driver */ 2046 *pnum = file_pnum; 2047 ret |= (ret2 & BDRV_BLOCK_ZERO); 2048 } 2049 } 2050 } 2051 2052 out: 2053 bdrv_dec_in_flight(bs); 2054 if (ret >= 0 && offset + *pnum == total_size) { 2055 ret |= BDRV_BLOCK_EOF; 2056 } 2057 early_out: 2058 if (file) { 2059 *file = local_file; 2060 } 2061 if (map) { 2062 *map = local_map; 2063 } 2064 return ret; 2065 } 2066 2067 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2068 BlockDriverState *base, 2069 bool want_zero, 2070 int64_t offset, 2071 int64_t bytes, 2072 int64_t *pnum, 2073 int64_t *map, 2074 BlockDriverState **file) 2075 { 2076 BlockDriverState *p; 2077 int ret = 0; 2078 bool first = true; 2079 2080 assert(bs != base); 2081 for (p = bs; p != base; p = backing_bs(p)) { 2082 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2083 file); 2084 if (ret < 0) { 2085 break; 2086 } 2087 if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2088 /* 2089 * Reading beyond the end of the file continues to read 2090 * zeroes, but we can only widen the result to the 2091 * unallocated length we learned from an earlier 2092 * iteration. 2093 */ 2094 *pnum = bytes; 2095 } 2096 if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2097 break; 2098 } 2099 /* [offset, pnum] unallocated on this layer, which could be only 2100 * the first part of [offset, bytes]. 
*/ 2101 bytes = MIN(bytes, *pnum); 2102 first = false; 2103 } 2104 return ret; 2105 } 2106 2107 /* Coroutine wrapper for bdrv_block_status_above() */ 2108 static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 2109 { 2110 BdrvCoBlockStatusData *data = opaque; 2111 2112 data->ret = bdrv_co_block_status_above(data->bs, data->base, 2113 data->want_zero, 2114 data->offset, data->bytes, 2115 data->pnum, data->map, data->file); 2116 data->done = true; 2117 } 2118 2119 /* 2120 * Synchronous wrapper around bdrv_co_block_status_above(). 2121 * 2122 * See bdrv_co_block_status_above() for details. 2123 */ 2124 static int bdrv_common_block_status_above(BlockDriverState *bs, 2125 BlockDriverState *base, 2126 bool want_zero, int64_t offset, 2127 int64_t bytes, int64_t *pnum, 2128 int64_t *map, 2129 BlockDriverState **file) 2130 { 2131 Coroutine *co; 2132 BdrvCoBlockStatusData data = { 2133 .bs = bs, 2134 .base = base, 2135 .want_zero = want_zero, 2136 .offset = offset, 2137 .bytes = bytes, 2138 .pnum = pnum, 2139 .map = map, 2140 .file = file, 2141 .done = false, 2142 }; 2143 2144 if (qemu_in_coroutine()) { 2145 /* Fast-path if already in coroutine context */ 2146 bdrv_block_status_above_co_entry(&data); 2147 } else { 2148 co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data); 2149 bdrv_coroutine_enter(bs, co); 2150 BDRV_POLL_WHILE(bs, !data.done); 2151 } 2152 return data.ret; 2153 } 2154 2155 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 2156 int64_t offset, int64_t bytes, int64_t *pnum, 2157 int64_t *map, BlockDriverState **file) 2158 { 2159 return bdrv_common_block_status_above(bs, base, true, offset, bytes, 2160 pnum, map, file); 2161 } 2162 2163 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2164 int64_t *pnum, int64_t *map, BlockDriverState **file) 2165 { 2166 return bdrv_block_status_above(bs, backing_bs(bs), 2167 offset, bytes, pnum, map, file); 2168 } 2169 2170 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2171 int64_t bytes, int64_t *pnum) 2172 { 2173 int ret; 2174 int64_t dummy; 2175 2176 ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, 2177 bytes, pnum ? pnum : &dummy, NULL, 2178 NULL); 2179 if (ret < 0) { 2180 return ret; 2181 } 2182 return !!(ret & BDRV_BLOCK_ALLOCATED); 2183 } 2184 2185 /* 2186 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 2187 * 2188 * Return true if (a prefix of) the given range is allocated in any image 2189 * between BASE and TOP (inclusive). BASE can be NULL to check if the given 2190 * offset is allocated in any image of the chain. Return false otherwise, 2191 * or negative errno on failure. 2192 * 2193 * 'pnum' is set to the number of bytes (including and immediately 2194 * following the specified offset) that are known to be in the same 2195 * allocated/unallocated state. Note that a subsequent call starting 2196 * at 'offset + *pnum' may return the same allocation status (in other 2197 * words, the result is not necessarily the maximum possible range); 2198 * but 'pnum' will only be 0 when end of file is reached. 
2199 * 2200 */ 2201 int bdrv_is_allocated_above(BlockDriverState *top, 2202 BlockDriverState *base, 2203 int64_t offset, int64_t bytes, int64_t *pnum) 2204 { 2205 BlockDriverState *intermediate; 2206 int ret; 2207 int64_t n = bytes; 2208 2209 intermediate = top; 2210 while (intermediate && intermediate != base) { 2211 int64_t pnum_inter; 2212 int64_t size_inter; 2213 2214 ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 2215 if (ret < 0) { 2216 return ret; 2217 } 2218 if (ret) { 2219 *pnum = pnum_inter; 2220 return 1; 2221 } 2222 2223 size_inter = bdrv_getlength(intermediate); 2224 if (size_inter < 0) { 2225 return size_inter; 2226 } 2227 if (n > pnum_inter && 2228 (intermediate == top || offset + pnum_inter < size_inter)) { 2229 n = pnum_inter; 2230 } 2231 2232 intermediate = backing_bs(intermediate); 2233 } 2234 2235 *pnum = n; 2236 return 0; 2237 } 2238 2239 typedef struct BdrvVmstateCo { 2240 BlockDriverState *bs; 2241 QEMUIOVector *qiov; 2242 int64_t pos; 2243 bool is_read; 2244 int ret; 2245 } BdrvVmstateCo; 2246 2247 static int coroutine_fn 2248 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2249 bool is_read) 2250 { 2251 BlockDriver *drv = bs->drv; 2252 int ret = -ENOTSUP; 2253 2254 bdrv_inc_in_flight(bs); 2255 2256 if (!drv) { 2257 ret = -ENOMEDIUM; 2258 } else if (drv->bdrv_load_vmstate) { 2259 if (is_read) { 2260 ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2261 } else { 2262 ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2263 } 2264 } else if (bs->file) { 2265 ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); 2266 } 2267 2268 bdrv_dec_in_flight(bs); 2269 return ret; 2270 } 2271 2272 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) 2273 { 2274 BdrvVmstateCo *co = opaque; 2275 co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); 2276 } 2277 2278 static inline int 2279 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 2280 bool is_read) 2281 { 2282 if (qemu_in_coroutine()) { 2283 return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); 2284 } else { 2285 BdrvVmstateCo data = { 2286 .bs = bs, 2287 .qiov = qiov, 2288 .pos = pos, 2289 .is_read = is_read, 2290 .ret = -EINPROGRESS, 2291 }; 2292 Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); 2293 2294 bdrv_coroutine_enter(bs, co); 2295 BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS); 2296 return data.ret; 2297 } 2298 } 2299 2300 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 2301 int64_t pos, int size) 2302 { 2303 QEMUIOVector qiov; 2304 struct iovec iov = { 2305 .iov_base = (void *) buf, 2306 .iov_len = size, 2307 }; 2308 int ret; 2309 2310 qemu_iovec_init_external(&qiov, &iov, 1); 2311 2312 ret = bdrv_writev_vmstate(bs, &qiov, pos); 2313 if (ret < 0) { 2314 return ret; 2315 } 2316 2317 return size; 2318 } 2319 2320 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2321 { 2322 return bdrv_rw_vmstate(bs, qiov, pos, false); 2323 } 2324 2325 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 2326 int64_t pos, int size) 2327 { 2328 QEMUIOVector qiov; 2329 struct iovec iov = { 2330 .iov_base = buf, 2331 .iov_len = size, 2332 }; 2333 int ret; 2334 2335 qemu_iovec_init_external(&qiov, &iov, 1); 2336 ret = bdrv_readv_vmstate(bs, &qiov, pos); 2337 if (ret < 0) { 2338 return ret; 2339 } 2340 2341 return size; 2342 } 2343 2344 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2345 { 2346 return bdrv_rw_vmstate(bs, qiov, pos, true); 2347 } 2348 
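/*
 * Illustrative sketch only (not part of the block layer): how a caller is
 * expected to consume the allocation-status contract documented above
 * bdrv_co_block_status().  Each call reports at most the requested number
 * of bytes in *pnum, so the caller advances by *pnum and stops at
 * BDRV_BLOCK_EOF or on error.  'map' and 'file' are optional and passed as
 * NULL here.  The function name is hypothetical, and the caller is assumed
 * to hold the AioContext of 'bs', as bdrv_flush_all() does above.
 */
static int __attribute__((unused))
example_walk_block_status(BlockDriverState *bs)
{
    int64_t offset = 0;
    int64_t total_size = bdrv_getlength(bs);

    if (total_size < 0) {
        return total_size;                  /* bdrv_getlength() failed */
    }

    while (offset < total_size) {
        int64_t pnum;
        int ret = bdrv_block_status(bs, offset, total_size - offset,
                                    &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        if (ret & BDRV_BLOCK_ZERO) {
            /* [offset, offset + pnum) is known to read as zeroes */
        } else if (ret & BDRV_BLOCK_DATA) {
            /* [offset, offset + pnum) carries data from this chain */
        }

        if ((ret & BDRV_BLOCK_EOF) || pnum == 0) {
            break;                          /* nothing allocated past here */
        }
        offset += pnum;
    }

    return 0;
}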
2349 /**************************************************************/ 2350 /* async I/Os */ 2351 2352 void bdrv_aio_cancel(BlockAIOCB *acb) 2353 { 2354 qemu_aio_ref(acb); 2355 bdrv_aio_cancel_async(acb); 2356 while (acb->refcnt > 1) { 2357 if (acb->aiocb_info->get_aio_context) { 2358 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2359 } else if (acb->bs) { 2360 /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 2361 * assert that we're not using an I/O thread. Thread-safe 2362 * code should use bdrv_aio_cancel_async exclusively. 2363 */ 2364 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 2365 aio_poll(bdrv_get_aio_context(acb->bs), true); 2366 } else { 2367 abort(); 2368 } 2369 } 2370 qemu_aio_unref(acb); 2371 } 2372 2373 /* Async version of aio cancel. The caller is not blocked if the acb implements 2374 * cancel_async, otherwise we do nothing and let the request normally complete. 2375 * In either case the completion callback must be called. */ 2376 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2377 { 2378 if (acb->aiocb_info->cancel_async) { 2379 acb->aiocb_info->cancel_async(acb); 2380 } 2381 } 2382 2383 /**************************************************************/ 2384 /* Coroutine block device emulation */ 2385 2386 typedef struct FlushCo { 2387 BlockDriverState *bs; 2388 int ret; 2389 } FlushCo; 2390 2391 2392 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 2393 { 2394 FlushCo *rwco = opaque; 2395 2396 rwco->ret = bdrv_co_flush(rwco->bs); 2397 } 2398 2399 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2400 { 2401 int current_gen; 2402 int ret = 0; 2403 2404 bdrv_inc_in_flight(bs); 2405 2406 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2407 bdrv_is_sg(bs)) { 2408 goto early_exit; 2409 } 2410 2411 qemu_co_mutex_lock(&bs->reqs_lock); 2412 current_gen = atomic_read(&bs->write_gen); 2413 2414 /* Wait until any previous flushes are completed */ 2415 while (bs->active_flush_req) { 2416 qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 2417 } 2418 2419 /* Flushes reach this point in nondecreasing current_gen order. 
 */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually QEMU doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ?
bdrv_co_flush(bs->file->bs) : 0; 2494 out: 2495 /* Notify any pending flushes that we have completed */ 2496 if (ret == 0) { 2497 bs->flushed_gen = current_gen; 2498 } 2499 2500 qemu_co_mutex_lock(&bs->reqs_lock); 2501 bs->active_flush_req = false; 2502 /* Return value is ignored - it's ok if wait queue is empty */ 2503 qemu_co_queue_next(&bs->flush_queue); 2504 qemu_co_mutex_unlock(&bs->reqs_lock); 2505 2506 early_exit: 2507 bdrv_dec_in_flight(bs); 2508 return ret; 2509 } 2510 2511 int bdrv_flush(BlockDriverState *bs) 2512 { 2513 Coroutine *co; 2514 FlushCo flush_co = { 2515 .bs = bs, 2516 .ret = NOT_DONE, 2517 }; 2518 2519 if (qemu_in_coroutine()) { 2520 /* Fast-path if already in coroutine context */ 2521 bdrv_flush_co_entry(&flush_co); 2522 } else { 2523 co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); 2524 bdrv_coroutine_enter(bs, co); 2525 BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE); 2526 } 2527 2528 return flush_co.ret; 2529 } 2530 2531 typedef struct DiscardCo { 2532 BlockDriverState *bs; 2533 int64_t offset; 2534 int bytes; 2535 int ret; 2536 } DiscardCo; 2537 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) 2538 { 2539 DiscardCo *rwco = opaque; 2540 2541 rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes); 2542 } 2543 2544 int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, 2545 int bytes) 2546 { 2547 BdrvTrackedRequest req; 2548 int max_pdiscard, ret; 2549 int head, tail, align; 2550 2551 if (!bs->drv) { 2552 return -ENOMEDIUM; 2553 } 2554 2555 if (bdrv_has_readonly_bitmaps(bs)) { 2556 return -EPERM; 2557 } 2558 2559 ret = bdrv_check_byte_request(bs, offset, bytes); 2560 if (ret < 0) { 2561 return ret; 2562 } else if (bs->read_only) { 2563 return -EPERM; 2564 } 2565 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 2566 2567 /* Do nothing if disabled. */ 2568 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2569 return 0; 2570 } 2571 2572 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 2573 return 0; 2574 } 2575 2576 /* Discard is advisory, but some devices track and coalesce 2577 * unaligned requests, so we must pass everything down rather than 2578 * round here. Still, most devices will just silently ignore 2579 * unaligned requests (by returning -ENOTSUP), so we must fragment 2580 * the request accordingly. */ 2581 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2582 assert(align % bs->bl.request_alignment == 0); 2583 head = offset % align; 2584 tail = (offset + bytes) % align; 2585 2586 bdrv_inc_in_flight(bs); 2587 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 2588 2589 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); 2590 if (ret < 0) { 2591 goto out; 2592 } 2593 2594 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 2595 align); 2596 assert(max_pdiscard >= bs->bl.request_alignment); 2597 2598 while (bytes > 0) { 2599 int num = bytes; 2600 2601 if (head) { 2602 /* Make small requests to get to alignment boundaries. */ 2603 num = MIN(bytes, align - head); 2604 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 2605 num %= bs->bl.request_alignment; 2606 } 2607 head = (head + num) % align; 2608 assert(num < max_pdiscard); 2609 } else if (tail) { 2610 if (num > align) { 2611 /* Shorten the request to the last aligned cluster. 
*/ 2612 num -= tail; 2613 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 2614 tail > bs->bl.request_alignment) { 2615 tail %= bs->bl.request_alignment; 2616 num -= tail; 2617 } 2618 } 2619 /* limit request size */ 2620 if (num > max_pdiscard) { 2621 num = max_pdiscard; 2622 } 2623 2624 if (!bs->drv) { 2625 ret = -ENOMEDIUM; 2626 goto out; 2627 } 2628 if (bs->drv->bdrv_co_pdiscard) { 2629 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 2630 } else { 2631 BlockAIOCB *acb; 2632 CoroutineIOCompletion co = { 2633 .coroutine = qemu_coroutine_self(), 2634 }; 2635 2636 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 2637 bdrv_co_io_em_complete, &co); 2638 if (acb == NULL) { 2639 ret = -EIO; 2640 goto out; 2641 } else { 2642 qemu_coroutine_yield(); 2643 ret = co.ret; 2644 } 2645 } 2646 if (ret && ret != -ENOTSUP) { 2647 goto out; 2648 } 2649 2650 offset += num; 2651 bytes -= num; 2652 } 2653 ret = 0; 2654 out: 2655 atomic_inc(&bs->write_gen); 2656 bdrv_set_dirty(bs, req.offset, req.bytes); 2657 tracked_request_end(&req); 2658 bdrv_dec_in_flight(bs); 2659 return ret; 2660 } 2661 2662 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) 2663 { 2664 Coroutine *co; 2665 DiscardCo rwco = { 2666 .bs = bs, 2667 .offset = offset, 2668 .bytes = bytes, 2669 .ret = NOT_DONE, 2670 }; 2671 2672 if (qemu_in_coroutine()) { 2673 /* Fast-path if already in coroutine context */ 2674 bdrv_pdiscard_co_entry(&rwco); 2675 } else { 2676 co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); 2677 bdrv_coroutine_enter(bs, co); 2678 BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE); 2679 } 2680 2681 return rwco.ret; 2682 } 2683 2684 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 2685 { 2686 BlockDriver *drv = bs->drv; 2687 CoroutineIOCompletion co = { 2688 .coroutine = qemu_coroutine_self(), 2689 }; 2690 BlockAIOCB *acb; 2691 2692 bdrv_inc_in_flight(bs); 2693 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 2694 co.ret = -ENOTSUP; 2695 goto out; 2696 } 2697 2698 if (drv->bdrv_co_ioctl) { 2699 co.ret = drv->bdrv_co_ioctl(bs, req, buf); 2700 } else { 2701 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 2702 if (!acb) { 2703 co.ret = -ENOTSUP; 2704 goto out; 2705 } 2706 qemu_coroutine_yield(); 2707 } 2708 out: 2709 bdrv_dec_in_flight(bs); 2710 return co.ret; 2711 } 2712 2713 void *qemu_blockalign(BlockDriverState *bs, size_t size) 2714 { 2715 return qemu_memalign(bdrv_opt_mem_align(bs), size); 2716 } 2717 2718 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 2719 { 2720 return memset(qemu_blockalign(bs, size), 0, size); 2721 } 2722 2723 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 2724 { 2725 size_t align = bdrv_opt_mem_align(bs); 2726 2727 /* Ensure that NULL is never returned on success */ 2728 assert(align > 0); 2729 if (size == 0) { 2730 size = align; 2731 } 2732 2733 return qemu_try_memalign(align, size); 2734 } 2735 2736 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 2737 { 2738 void *mem = qemu_try_blockalign(bs, size); 2739 2740 if (mem) { 2741 memset(mem, 0, size); 2742 } 2743 2744 return mem; 2745 } 2746 2747 /* 2748 * Check if all memory in this vector is sector aligned. 
2749 */ 2750 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 2751 { 2752 int i; 2753 size_t alignment = bdrv_min_mem_align(bs); 2754 2755 for (i = 0; i < qiov->niov; i++) { 2756 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 2757 return false; 2758 } 2759 if (qiov->iov[i].iov_len % alignment) { 2760 return false; 2761 } 2762 } 2763 2764 return true; 2765 } 2766 2767 void bdrv_add_before_write_notifier(BlockDriverState *bs, 2768 NotifierWithReturn *notifier) 2769 { 2770 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 2771 } 2772 2773 void bdrv_io_plug(BlockDriverState *bs) 2774 { 2775 BdrvChild *child; 2776 2777 QLIST_FOREACH(child, &bs->children, next) { 2778 bdrv_io_plug(child->bs); 2779 } 2780 2781 if (atomic_fetch_inc(&bs->io_plugged) == 0) { 2782 BlockDriver *drv = bs->drv; 2783 if (drv && drv->bdrv_io_plug) { 2784 drv->bdrv_io_plug(bs); 2785 } 2786 } 2787 } 2788 2789 void bdrv_io_unplug(BlockDriverState *bs) 2790 { 2791 BdrvChild *child; 2792 2793 assert(bs->io_plugged); 2794 if (atomic_fetch_dec(&bs->io_plugged) == 1) { 2795 BlockDriver *drv = bs->drv; 2796 if (drv && drv->bdrv_io_unplug) { 2797 drv->bdrv_io_unplug(bs); 2798 } 2799 } 2800 2801 QLIST_FOREACH(child, &bs->children, next) { 2802 bdrv_io_unplug(child->bs); 2803 } 2804 } 2805 2806 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 2807 { 2808 BdrvChild *child; 2809 2810 if (bs->drv && bs->drv->bdrv_register_buf) { 2811 bs->drv->bdrv_register_buf(bs, host, size); 2812 } 2813 QLIST_FOREACH(child, &bs->children, next) { 2814 bdrv_register_buf(child->bs, host, size); 2815 } 2816 } 2817 2818 void bdrv_unregister_buf(BlockDriverState *bs, void *host) 2819 { 2820 BdrvChild *child; 2821 2822 if (bs->drv && bs->drv->bdrv_unregister_buf) { 2823 bs->drv->bdrv_unregister_buf(bs, host); 2824 } 2825 QLIST_FOREACH(child, &bs->children, next) { 2826 bdrv_unregister_buf(child->bs, host); 2827 } 2828 } 2829
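/*
 * Illustrative sketch only (not part of the block layer): the typical
 * pattern for combining bdrv_qiov_is_aligned() with qemu_try_blockalign()
 * on the write path.  A vector that does not satisfy the minimum memory
 * alignment is copied into an aligned bounce buffer before it is handed to
 * a driver that needs aligned memory (e.g. for O_DIRECT).  The function
 * name is hypothetical; only the helpers above are real block-layer APIs.
 * The caller owns the returned buffer and must release it with qemu_vfree().
 */
static bool __attribute__((unused))
example_bounce_if_unaligned(BlockDriverState *bs, QEMUIOVector *qiov,
                            void **bounce)
{
    *bounce = NULL;

    if (bdrv_qiov_is_aligned(bs, qiov)) {
        return false;               /* vector can be submitted as it is */
    }

    /* Round the length up as well: bdrv_qiov_is_aligned() checks both the
     * base addresses and the lengths against bdrv_min_mem_align(). */
    *bounce = qemu_try_blockalign(bs,
                                  QEMU_ALIGN_UP(qiov->size,
                                                bdrv_min_mem_align(bs)));
    if (*bounce) {
        qemu_iovec_to_buf(qiov, 0, *bounce, qiov->size);
    }
    return true;                    /* bounce needed (NULL means -ENOMEM) */
}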