/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}
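
/*
 * Illustrative sketch (not part of the original code): bdrv_merge_limits()
 * tightens restrictive limits with MIN_NON_ZERO (a value of 0 means
 * "unlimited") and widens alignment requirements with MAX.  Assuming a parent
 * with no transfer limit and a child capped at 64 KiB:
 *
 *     BlockLimits parent = { .max_transfer = 0,     .opt_mem_alignment = 4096 };
 *     BlockLimits child  = { .max_transfer = 65536, .opt_mem_alignment = 512  };
 *
 *     bdrv_merge_limits(&parent, &child);
 *     assert(parent.max_transfer == 65536);     // 0 is treated as "no limit"
 *     assert(parent.opt_mem_alignment == 4096); // strictest alignment wins
 */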
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    BdrvChild *parent;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_wakeup(bs);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
{
    BdrvChild *child, *tmp;
    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
    bdrv_coroutine_enter(bs, data.co);
    BDRV_POLL_WHILE(bs, !data.done);

    if (recursive) {
        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
            bdrv_drain_invoke(child->bs, begin, true);
        }
    }
}

static bool bdrv_drain_recurse(BlockDriverState *bs)
{
    BdrvChild *child, *tmp;
    bool waited;

    /* Wait for drained requests to finish */
    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
        BlockDriverState *bs = child->bs;
        bool in_main_loop =
            qemu_get_current_aio_context() == qemu_get_aio_context();
        assert(bs->refcnt > 0);
        if (in_main_loop) {
            /* In case the recursive bdrv_drain_recurse processes a
             * block_job_defer_to_main_loop BH and modifies the graph,
             * let's hold a reference to bs until we are done.
             *
             * IOThread doesn't have such a BH, and it is not safe to call
             * bdrv_unref without BQL, so skip doing it there.
             */
            bdrv_ref(bs);
        }
        waited |= bdrv_drain_recurse(bs);
        if (in_main_loop) {
            bdrv_unref(bs);
        }
    }

    return waited;
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    bdrv_dec_in_flight(bs);
    if (data->begin) {
        bdrv_do_drained_begin(bs, data->recursive, data->parent);
    } else {
        bdrv_do_drained_end(bs, data->recursive, data->parent);
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued from
     * qemu_co_queue_run_restart(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
    };
    bdrv_inc_in_flight(bs);
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                           BdrvChild *parent)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent);
        return;
    }

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent);
    bdrv_drain_invoke(bs, true, false);
    bdrv_drain_recurse(bs);

    if (recursive) {
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child);
        }
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL);
}

void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                         BdrvChild *parent)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, false);
    bdrv_parent_drained_end(bs, parent);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL);
}
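
/*
 * Illustrative sketch (not from the original file): callers typically use the
 * drained-begin/end pair as a bracket around graph manipulation or other work
 * that must not race with in-flight I/O on @bs.  The calls nest, because
 * bs->quiesce_counter is a counter rather than a flag:
 *
 *     bdrv_drained_begin(bs);
 *     ... reconfigure or detach nodes while no requests are in flight ...
 *     bdrv_drained_end(bs);
 *
 * bdrv_subtree_drained_begin()/end() work the same way but also quiesce all
 * children recursively.
 */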
void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts.  In that case, use
 * bdrv_drain_all() instead.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool waited = true;
    BlockDriverState *bs;
    BdrvNextIterator it;
    GSList *aio_ctxs = NULL, *ctx;

    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
     * nodes in several different AioContexts, so make sure we're in the main
     * context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Stop things in parent-to-child order */
        aio_context_acquire(aio_context);
        aio_disable_external(aio_context);
        bdrv_parent_drained_begin(bs, NULL);
        bdrv_drain_invoke(bs, true, true);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion.  Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
     */
    while (waited) {
        waited = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;

            aio_context_acquire(aio_context);
            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    waited |= bdrv_drain_recurse(bs);
                }
            }
            aio_context_release(aio_context);
        }
    }

    g_slist_free(aio_ctxs);
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs;
    BdrvNextIterator it;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Re-enable things in child-to-parent order */
        aio_context_acquire(aio_context);
        bdrv_drain_invoke(bs, false, true);
        bdrv_parent_drained_end(bs, NULL);
        aio_enable_external(aio_context);
        aio_context_release(aio_context);
    }
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                                 - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}
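
/*
 * Worked example (illustrative, not part of the original code): with a
 * cluster size of 64 KiB, a request at offset 70000 for 1000 bytes is widened
 * to the containing cluster range:
 *
 *     int64_t cluster_offset, cluster_bytes;
 *     bdrv_round_to_clusters(bs, 70000, 1000, &cluster_offset, &cluster_bytes);
 *     // cluster_offset == 65536  (QEMU_ALIGN_DOWN(70000, 65536))
 *     // cluster_bytes  == 65536  (QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536))
 *
 * If the driver reports no cluster size, the request is passed through
 * unchanged.
 */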
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /* aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick(bdrv_get_aio_wait(bs));
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        offset += bytes;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}
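
/*
 * Illustrative usage sketch (not part of the original code): the synchronous
 * byte-level helpers above are convenient outside of coroutine-heavy fast
 * paths, e.g. for reading and updating a format header.  Assuming a 512-byte
 * header buffer:
 *
 *     uint8_t header[512];
 *     int ret = bdrv_pread(child, 0, header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;                    // negative errno on failure
 *     }
 *     ... modify header ...
 *     ret = bdrv_pwrite_sync(child, 0, header, sizeof(header));
 *
 * On success the read/write helpers return the number of bytes transferred
 * (qiov->size), while bdrv_pwrite_sync() below returns 0.
 */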
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_readv) {
        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_writev_flags) {
        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
                                        flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
    } else if (drv->bdrv_co_writev) {
        assert(!bs->supported_write_flags);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    }

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.  Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    while (cluster_bytes) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if
             * unallocated; we'll probably fail again soon on the
             * read, but at least that will set a decent errno.
             */
            pnum = MIN(cluster_bytes, max_transfer);
        }

        assert(skip_bytes < pnum);

        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            iov.iov_base = bounce_buffer;
            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_external(&local_qiov, &iov, 1);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero?
                 */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them.
         */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            QEMUIOVector local_qiov;

            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Handle a read request in coroutine context
 */
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
                          nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
                               int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
}

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.
             */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}
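
/*
 * Worked example (illustrative, not from the original code) of the head/tail
 * splitting above: with a pwrite_zeroes alignment of 4096, a request at
 * offset 6144 for 16384 bytes is broken up as
 *
 *     head = 6144 % 4096 = 2048            -> first chunk of 2048 bytes,
 *                                             bringing the offset to 8192
 *     aligned middle                       -> 4096-aligned chunks, each
 *                                             capped at max_write_zeroes and
 *                                             max_transfer
 *     tail = (6144 + 16384) % 4096 = 2048  -> final unaligned chunk of 2048
 *
 * so only the first and last chunks may need the unaligned path or the
 * bounce-buffer fallback.
 */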
/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(child->perm & BLK_PERM_WRITE);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, offset, bytes);

    stat64_max(&bs->wr_highest_offset, offset + bytes);

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, end_sector);
        ret = 0;
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    BlockDriverState *bs = child->bs;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = bs->bl.request_alignment;
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);


    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base = buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
                                   align, &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}

/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base = head_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base = tail_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
                                int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Flush ALL BDSes regardless of whether they are reachable via a BlkBackend
 * or not.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}


typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    bool want_zero;
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
    BlockDriverState **file;
    int ret;
    bool done;
} BdrvCoBlockStatusData;

int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
                                                bool want_zero,
                                                int64_t offset,
                                                int64_t bytes,
                                                int64_t *pnum,
                                                int64_t *map,
                                                BlockDriverState **file)
{
    assert(bs->file && bs->file->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->file->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}

int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    assert(bs->backing && bs->backing->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}
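
/*
 * Illustrative sketch (not part of the original code): the two helpers above
 * exist so that filter-like drivers can forward block-status queries without
 * writing their own callback.  A driver that passes everything through to
 * bs->file might wire it up roughly like this (hypothetical driver shown):
 *
 *     static BlockDriver bdrv_example_filter = {
 *         .format_name          = "example-filter",
 *         .bdrv_co_block_status = bdrv_co_block_status_from_file,
 *         ...
 *     };
 *
 * The BDRV_BLOCK_RAW return value tells bdrv_co_block_status() below to
 * recurse into the returned *file at the returned *map offset.
 */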
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
    if (ret < 0) {
        *pnum = 0;
        goto out;
    }

    /*
     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
1992 */ 1993 assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 1994 align > offset - aligned_offset); 1995 *pnum -= offset - aligned_offset; 1996 if (*pnum > bytes) { 1997 *pnum = bytes; 1998 } 1999 if (ret & BDRV_BLOCK_OFFSET_VALID) { 2000 local_map += offset - aligned_offset; 2001 } 2002 2003 if (ret & BDRV_BLOCK_RAW) { 2004 assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 2005 ret = bdrv_co_block_status(local_file, want_zero, local_map, 2006 *pnum, pnum, &local_map, &local_file); 2007 goto out; 2008 } 2009 2010 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 2011 ret |= BDRV_BLOCK_ALLOCATED; 2012 } else if (want_zero) { 2013 if (bdrv_unallocated_blocks_are_zero(bs)) { 2014 ret |= BDRV_BLOCK_ZERO; 2015 } else if (bs->backing) { 2016 BlockDriverState *bs2 = bs->backing->bs; 2017 int64_t size2 = bdrv_getlength(bs2); 2018 2019 if (size2 >= 0 && offset >= size2) { 2020 ret |= BDRV_BLOCK_ZERO; 2021 } 2022 } 2023 } 2024 2025 if (want_zero && local_file && local_file != bs && 2026 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 2027 (ret & BDRV_BLOCK_OFFSET_VALID)) { 2028 int64_t file_pnum; 2029 int ret2; 2030 2031 ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 2032 *pnum, &file_pnum, NULL, NULL); 2033 if (ret2 >= 0) { 2034 /* Ignore errors. This is just providing extra information, it 2035 * is useful but not necessary. 2036 */ 2037 if (ret2 & BDRV_BLOCK_EOF && 2038 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2039 /* 2040 * It is valid for the format block driver to read 2041 * beyond the end of the underlying file's current 2042 * size; such areas read as zero. 2043 */ 2044 ret |= BDRV_BLOCK_ZERO; 2045 } else { 2046 /* Limit request to the range reported by the protocol driver */ 2047 *pnum = file_pnum; 2048 ret |= (ret2 & BDRV_BLOCK_ZERO); 2049 } 2050 } 2051 } 2052 2053 out: 2054 bdrv_dec_in_flight(bs); 2055 if (ret >= 0 && offset + *pnum == total_size) { 2056 ret |= BDRV_BLOCK_EOF; 2057 } 2058 early_out: 2059 if (file) { 2060 *file = local_file; 2061 } 2062 if (map) { 2063 *map = local_map; 2064 } 2065 return ret; 2066 } 2067 2068 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2069 BlockDriverState *base, 2070 bool want_zero, 2071 int64_t offset, 2072 int64_t bytes, 2073 int64_t *pnum, 2074 int64_t *map, 2075 BlockDriverState **file) 2076 { 2077 BlockDriverState *p; 2078 int ret = 0; 2079 bool first = true; 2080 2081 assert(bs != base); 2082 for (p = bs; p != base; p = backing_bs(p)) { 2083 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 2084 file); 2085 if (ret < 0) { 2086 break; 2087 } 2088 if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2089 /* 2090 * Reading beyond the end of the file continues to read 2091 * zeroes, but we can only widen the result to the 2092 * unallocated length we learned from an earlier 2093 * iteration. 2094 */ 2095 *pnum = bytes; 2096 } 2097 if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2098 break; 2099 } 2100 /* [offset, pnum] unallocated on this layer, which could be only 2101 * the first part of [offset, bytes]. 
static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
                                                   BlockDriverState *base,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    BlockDriverState *p;
    int ret = 0;
    bool first = true;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                   file);
        if (ret < 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
            *pnum = bytes;
        }
        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
            break;
        }
        /* [offset, pnum] unallocated on this layer, which could be only
         * the first part of [offset, bytes].  */
        bytes = MIN(bytes, *pnum);
        first = false;
    }
    return ret;
}

/* Coroutine wrapper for bdrv_block_status_above() */
static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
{
    BdrvCoBlockStatusData *data = opaque;

    data->ret = bdrv_co_block_status_above(data->bs, data->base,
                                           data->want_zero,
                                           data->offset, data->bytes,
                                           data->pnum, data->map, data->file);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_block_status_above().
 *
 * See bdrv_co_block_status_above() for details.
 */
static int bdrv_common_block_status_above(BlockDriverState *bs,
                                          BlockDriverState *base,
                                          bool want_zero, int64_t offset,
                                          int64_t bytes, int64_t *pnum,
                                          int64_t *map,
                                          BlockDriverState **file)
{
    Coroutine *co;
    BdrvCoBlockStatusData data = {
        .bs = bs,
        .base = base,
        .want_zero = want_zero,
        .offset = offset,
        .bytes = bytes,
        .pnum = pnum,
        .map = map,
        .file = file,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_block_status_above_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
}

int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file)
{
    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
                                          pnum, map, file);
}

int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    return bdrv_block_status_above(bs, backing_bs(bs),
                                   offset, bytes, pnum, map, file);
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int ret;
    int64_t dummy;

    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
                                         bytes, pnum ? pnum : &dummy, NULL,
                                         NULL);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}
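
/*
 * Illustrative sketch (not part of the original code): how a caller might
 * walk an image with bdrv_block_status() and act on the returned bits.  The
 * helper name is hypothetical; error handling is kept to the minimum:
 *
 *     static int walk_example(BlockDriverState *bs)
 *     {
 *         int64_t total = bdrv_getlength(bs);
 *         int64_t offset = 0, pnum, map;
 *         BlockDriverState *file;
 *
 *         if (total < 0) {
 *             return total;
 *         }
 *         while (offset < total) {
 *             int ret = bdrv_block_status(bs, offset, total - offset,
 *                                         &pnum, &map, &file);
 *             if (ret < 0) {
 *                 return ret;
 *             }
 *             if (ret & BDRV_BLOCK_ZERO) {
 *                 // [offset, offset + pnum) reads as zeroes
 *             } else if (ret & BDRV_BLOCK_OFFSET_VALID) {
 *                 // data for this range lives at 'map' within 'file'
 *             }
 *             offset += pnum;  // pnum > 0 while offset < total
 *         }
 *         return 0;
 *     }
 */
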
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return 1 if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (inclusive).  BASE can be NULL to check whether the
 * given offset is allocated in any image of the chain.  Return 0 otherwise,
 * or a negative errno on failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum)
{
    BlockDriverState *intermediate;
    int ret;
    int64_t n = bytes;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int64_t pnum_inter;
        int64_t size_inter;

        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        size_inter = bdrv_getlength(intermediate);
        if (size_inter < 0) {
            return size_inter;
        }
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}

typedef struct BdrvVmstateCo {
    BlockDriverState *bs;
    QEMUIOVector *qiov;
    int64_t pos;
    bool is_read;
    int ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
    int ret = -ENOTSUP;

    bdrv_inc_in_flight(bs);

    if (!drv) {
        ret = -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        /* Drivers are expected to implement the load and save vmstate
         * callbacks as a pair, so probing for one of them is sufficient. */
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
    } else if (bs->file) {
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    bdrv_dec_in_flight(bs);
    return ret;
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs = bs,
            .qiov = qiov,
            .pos = pos,
            .is_read = is_read,
            .ret = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
        return data.ret;
    }
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}
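
/*
 * Illustrative sketch (not part of the original code): the buffer-based
 * helpers above are what a migration-style caller would typically use to
 * stash a blob in the image's vmstate area and read it back.  The helper
 * name and buffer size are hypothetical:
 *
 *     static int vmstate_roundtrip_example(BlockDriverState *bs)
 *     {
 *         uint8_t out[512] = { 0x42 };
 *         uint8_t in[512];
 *         int ret;
 *
 *         ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
 *         if (ret < 0) {
 *             return ret;
 *         }
 *         ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
 *         return ret < 0 ? ret : 0;
 *     }
 *
 * As the definitions above show, both helpers return the number of bytes
 * transferred on success and a negative errno on failure.
 */
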
/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel.  The caller is not blocked if the acb
 * implements cancel_async; otherwise we do nothing and let the request
 * complete normally.  In either case the completion callback must be
 * called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;


static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
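
/*
 * Note added for clarity: FlushCo plus bdrv_flush_co_entry() follow the same
 * synchronous-wrapper pattern used by bdrv_common_block_status_above() above
 * and bdrv_pdiscard() below: the caller fills a small context struct, and if
 * it is already running in a coroutine it simply calls the entry function;
 * otherwise it creates a coroutine, enters it with bdrv_coroutine_enter(),
 * and spins in BDRV_POLL_WHILE() until the coroutine has stored its result
 * in the struct.
 */
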
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush.  Usually qemu doesn't
         * know how the remote server behaves (the behaviour may be hardcoded
         * or depend on server-side configuration), so we can't ensure that
         * everything is safe on disk.  Returning an error doesn't work
         * either, because that would break guests even if the server does
         * operate in writethrough mode.
         *
         * Let's hope the user knows what they are doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if the wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}
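
/*
 * Illustrative sketch (not part of the original code): a protocol driver
 * that keeps a file descriptor would typically plug into the dispatch above
 * by implementing .bdrv_co_flush_to_disk.  The driver state type and field
 * below are hypothetical:
 *
 *     static int coroutine_fn example_co_flush_to_disk(BlockDriverState *bs)
 *     {
 *         ExampleState *s = bs->opaque;
 *
 *         return qemu_fdatasync(s->fd) < 0 ? -errno : 0;
 *     }
 *
 * bdrv_co_flush() would then call it between BLKDBG_FLUSH_TO_DISK and the
 * recursive flush of bs->file.
 */
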
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int bytes;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
}

int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* Limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, req.offset, req.bytes);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}

int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}
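
/*
 * Illustrative sketch (not part of the original code): a typical bounce
 * buffer allocated with the helpers above and released with qemu_vfree().
 * The helper name and buffer size are hypothetical:
 *
 *     static int bounce_example(BlockDriverState *bs)
 *     {
 *         void *buf = qemu_try_blockalign0(bs, 64 * 1024);
 *
 *         if (buf == NULL) {
 *             return -ENOMEM;
 *         }
 *         // ... fill the buffer and issue suitably aligned I/O with it ...
 *         qemu_vfree(buf);
 *         return 0;
 *     }
 */
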
/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}

void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_register_buf) {
        bs->drv->bdrv_register_buf(bs, host, size);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_register_buf(child->bs, host, size);
    }
}

void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_unregister_buf) {
        bs->drv->bdrv_unregister_buf(bs, host);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_unregister_buf(child->bs, host);
    }
}
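
/*
 * Usage note added for clarity: bdrv_io_plug() and bdrv_io_unplug() are
 * reference counted and must always be paired.  A device emulator that
 * wants to batch several requests typically brackets their submission:
 *
 *     bdrv_io_plug(bs);
 *     // ... queue a number of asynchronous requests against bs ...
 *     bdrv_io_unplug(bs);   // drivers may submit the whole batch here
 *
 * Only the outermost plug/unplug pair reaches the driver's bdrv_io_plug and
 * bdrv_io_unplug callbacks, as the atomic counters above show.
 */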