/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = drv->bdrv_co_preadv ?
        1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    BdrvChild *parent;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_wakeup(bs);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
{
    BdrvChild *child, *tmp;
    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
    bdrv_coroutine_enter(bs, data.co);
    BDRV_POLL_WHILE(bs, !data.done);

    if (recursive) {
        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
            bdrv_drain_invoke(child->bs, begin, true);
        }
    }
}

static bool bdrv_drain_recurse(BlockDriverState *bs)
{
    BdrvChild *child, *tmp;
    bool waited;

    /* Wait for drained requests to finish */
    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
        BlockDriverState *bs = child->bs;
        bool in_main_loop =
            qemu_get_current_aio_context() == qemu_get_aio_context();
        assert(bs->refcnt > 0);
        if (in_main_loop) {
            /* In case the recursive bdrv_drain_recurse processes a
             * block_job_defer_to_main_loop BH and modifies the graph,
             * let's hold a reference to bs until we are done.
             *
             * IOThread doesn't have such a BH, and it is not safe to call
             * bdrv_unref without BQL, so skip doing it there.
             */
            bdrv_ref(bs);
        }
        waited |= bdrv_drain_recurse(bs);
        if (in_main_loop) {
            bdrv_unref(bs);
        }
    }

    return waited;
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    bdrv_dec_in_flight(bs);
    if (data->begin) {
        bdrv_do_drained_begin(bs, data->recursive, data->parent);
    } else {
        bdrv_do_drained_end(bs, data->recursive, data->parent);
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued from
     * qemu_co_queue_run_restart(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
    };
    bdrv_inc_in_flight(bs);
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                           BdrvChild *parent)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent);
        return;
    }

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent);
    bdrv_drain_invoke(bs, true, false);
    bdrv_drain_recurse(bs);

    if (recursive) {
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child);
        }
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL);
}

void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                         BdrvChild *parent)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, false);
    bdrv_parent_drained_end(bs, parent);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
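    /* Ends a drained section started by bdrv_subtree_drained_begin();
     * recurses into all children, mirroring the begin side. */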
    bdrv_do_drained_end(bs, true, NULL);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts. In that case, use
 * bdrv_drain_all() instead.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool waited = true;
    BlockDriverState *bs;
    BdrvNextIterator it;
    GSList *aio_ctxs = NULL, *ctx;

    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
     * nodes in several different AioContexts, so make sure we're in the main
     * context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Stop things in parent-to-child order */
        aio_context_acquire(aio_context);
        aio_disable_external(aio_context);
        bdrv_parent_drained_begin(bs, NULL);
        bdrv_drain_invoke(bs, true, true);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion. Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
     */
    while (waited) {
        waited = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;

            aio_context_acquire(aio_context);
            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    waited |= bdrv_drain_recurse(bs);
                }
            }
            aio_context_release(aio_context);
        }
    }

    g_slist_free(aio_ctxs);
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs;
    BdrvNextIterator it;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Re-enable things in child-to-parent order */
        aio_context_acquire(aio_context);
        bdrv_drain_invoke(bs, false, true);
        bdrv_parent_drained_end(bs, NULL);
        aio_enable_external(aio_context);
        aio_context_release(aio_context);
    }
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                                 - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool
tracked_request_overlaps(BdrvTrackedRequest *req,
                         int64_t offset, unsigned int bytes)
{
    /* aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

static void dummy_bh_cb(void *opaque)
{
}

void bdrv_wakeup(BlockDriverState *bs)
{
    /* The barrier (or an atomic op) is in the caller. */
    if (atomic_read(&bs->wakeup)) {
        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
    }
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case).
                 */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        offset += bytes;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_readv) {
        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_writev_flags) {
        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
                                        flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
    } else if (drv->bdrv_co_writev) {
        assert(!bs->supported_write_flags);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    }

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv =
        bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file. Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    while (cluster_bytes) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if
             * unallocated; we'll probably fail again soon on the
             * read, but at least that will set a decent errno.
             */
            pnum = MIN(cluster_bytes, max_transfer);
        }

        assert(skip_bytes < pnum);

        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            iov.iov_base = bounce_buffer;
            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_external(&local_qiov, &iov, 1);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero?
                 */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests. If this is a deliberate copy-on-read
                 * then we don't want to ignore the error. Simply
                 * report it in all cases.
                 */
                goto err;
            }

            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers. For now, there aren't any
     * passthrough flags. */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap. This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster. For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them.
         */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            QEMUIOVector local_qiov;

            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Handle a read request in coroutine context
 */
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
                          nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
                               int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
}

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request. Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes. */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.
             */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
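 * This is also where zero detection (detect-zeroes), the before-write
 * notifiers, dirty bitmap updates and the wr_highest_offset statistic are
 * applied.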
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(child->perm & BLK_PERM_WRITE);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, offset, bytes);

    stat64_max(&bs->wr_highest_offset, offset + bytes);

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, end_sector);
        ret = 0;
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
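    /* Unaligned head and tail are emulated below with a read-modify-write of
     * one request_alignment block each; only the aligned middle is passed
     * down as a real zero write. */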
    BlockDriverState *bs = child->bs;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = bs->bl.request_alignment;
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);


    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base = buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
                                   align, &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}

/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (!qiov) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base = head_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base = tail_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ?
                               &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
                                int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Flush ALL BDSes regardless of whether they are reachable via a
 * BlockBackend or not.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}


typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    bool want_zero;
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
    BlockDriverState **file;
    int ret;
    bool done;
} BdrvCoBlockStatusData;

int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
                                                        int64_t sector_num,
                                                        int nb_sectors,
                                                        int *pnum,
                                                        BlockDriverState **file)
{
    assert(bs->file && bs->file->bs);
    *pnum = nb_sectors;
    *file = bs->file->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
           (sector_num << BDRV_SECTOR_BITS);
}

int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
                                                           int64_t sector_num,
                                                           int nb_sectors,
                                                           int *pnum,
                                                           BlockDriverState **file)
{
    assert(bs->backing && bs->backing->bs);
    *pnum = nb_sectors;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
           (sector_num << BDRV_SECTOR_BITS);
}

/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping purposes,
 * and the result should include BDRV_BLOCK_OFFSET_VALID and
 * BDRV_BLOCK_ZERO where possible; otherwise, the result may omit those
 * bits particularly if it allows for a larger value in 'pnum'.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to. If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state. Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure. Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    /* TODO: until we have a byte-based driver callback, we also have to
     * round out to sectors, even if that is bigger than request_alignment */
    align = MAX(bs->bl.request_alignment, BDRV_SECTOR_SIZE);
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    {
        int count; /* sectors */
        int64_t longret;

        assert(QEMU_IS_ALIGNED(aligned_offset | aligned_bytes,
                               BDRV_SECTOR_SIZE));
        /*
         * The contract allows us to return pnum smaller than bytes, even
         * if the next query would see the same status; we truncate the
         * request to avoid overflowing the driver's 32-bit interface.
         */
        longret = bs->drv->bdrv_co_get_block_status(
            bs, aligned_offset >> BDRV_SECTOR_BITS,
            MIN(INT_MAX, aligned_bytes) >> BDRV_SECTOR_BITS, &count,
            &local_file);
        if (longret < 0) {
            assert(INT_MIN <= longret);
            ret = longret;
            goto out;
        }
        if (longret & BDRV_BLOCK_OFFSET_VALID) {
            local_map = longret & BDRV_BLOCK_OFFSET_MASK;
        }
        ret = longret & ~BDRV_BLOCK_OFFSET_MASK;
        *pnum = count * BDRV_SECTOR_SIZE;
    }

    /*
     * The driver's result must be a multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
    assert(QEMU_IS_ALIGNED(*pnum, align) && align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        local_map += offset - aligned_offset;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else if (want_zero) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t size2 = bdrv_getlength(bs2);

            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (want_zero && local_file && local_file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int64_t file_pnum;
        int ret2;

        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information; it
             * is useful but not necessary.
             */
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

out:
    bdrv_dec_in_flight(bs);
    if (ret >= 0 && offset + *pnum == total_size) {
        ret |= BDRV_BLOCK_EOF;
    }
early_out:
    if (file) {
        *file = local_file;
    }
    if (map) {
        *map = local_map;
    }
    return ret;
}

static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
                                                   BlockDriverState *base,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    BlockDriverState *p;
    int ret = 0;
    bool first = true;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                   file);
        if (ret < 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
            *pnum = bytes;
        }
        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
            break;
        }
        /* [offset, pnum] unallocated on this layer, which could be only
         * the first part of [offset, bytes].  */
        bytes = MIN(bytes, *pnum);
        first = false;
    }
    return ret;
}

/* Coroutine wrapper for bdrv_block_status_above() */
static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
{
    BdrvCoBlockStatusData *data = opaque;

    data->ret = bdrv_co_block_status_above(data->bs, data->base,
                                           data->want_zero,
                                           data->offset, data->bytes,
                                           data->pnum, data->map, data->file);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_block_status_above().
 *
 * See bdrv_co_block_status_above() for details.
 */
static int bdrv_common_block_status_above(BlockDriverState *bs,
                                          BlockDriverState *base,
                                          bool want_zero, int64_t offset,
                                          int64_t bytes, int64_t *pnum,
                                          int64_t *map,
                                          BlockDriverState **file)
{
    Coroutine *co;
    BdrvCoBlockStatusData data = {
        .bs = bs,
        .base = base,
        .want_zero = want_zero,
        .offset = offset,
        .bytes = bytes,
        .pnum = pnum,
        .map = map,
        .file = file,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_block_status_above_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
}

int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file)
{
    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
                                          pnum, map, file);
}

int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    return bdrv_block_status_above(bs, backing_bs(bs),
                                   offset, bytes, pnum, map, file);
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int ret;
    int64_t dummy;

    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
                                         bytes, pnum ? pnum : &dummy, NULL,
                                         NULL);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}
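/*
 * Illustrative only (not part of the original file, kept under #if 0 so it is
 * never compiled): a minimal sketch of how a synchronous caller might walk an
 * image with bdrv_block_status(), relying on the contract documented above:
 * each call advances by *pnum bytes, *pnum is only 0 at end-of-file, and
 * BDRV_BLOCK_EOF marks the final extent.  The function name dump_extents is
 * hypothetical.
 */
#if 0 /* usage sketch */
static void dump_extents(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    while (bytes > 0) {
        int64_t pnum, map;
        BlockDriverState *file;
        int ret = bdrv_block_status(bs, offset, bytes, &pnum, &map, &file);

        if (ret < 0 || pnum == 0) {
            break; /* error, or end of file reached */
        }
        printf("%" PRId64 "+%" PRId64 ": allocated=%d zero=%d\n", offset, pnum,
               !!(ret & BDRV_BLOCK_ALLOCATED), !!(ret & BDRV_BLOCK_ZERO));
        offset += pnum;
        bytes -= pnum;
    }
}
#endif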
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * offset is allocated in any image of the chain.  Return false otherwise,
 * or negative errno on failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum)
{
    BlockDriverState *intermediate;
    int ret;
    int64_t n = bytes;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int64_t pnum_inter;
        int64_t size_inter;

        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        size_inter = bdrv_getlength(intermediate);
        if (size_inter < 0) {
            return size_inter;
        }
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}

typedef struct BdrvVmstateCo {
    BlockDriverState *bs;
    QEMUIOVector *qiov;
    int64_t pos;
    bool is_read;
    int ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
    int ret = -ENOTSUP;

    bdrv_inc_in_flight(bs);

    if (!drv) {
        ret = -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
    } else if (bs->file) {
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    bdrv_dec_in_flight(bs);
    return ret;
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs = bs,
            .qiov = qiov,
            .pos = pos,
            .is_read = is_read,
            .ret = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
        return data.ret;
    }
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}

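/*
 * Illustrative only (not part of the original file, kept under #if 0 so it is
 * never compiled): a minimal sketch of the vmstate helpers defined above.  On
 * success both helpers return 'size', so a short return value never has to be
 * handled; failures come back as negative errno (e.g. -ENOTSUP if no layer in
 * the chain implements vmstate).  The function name vmstate_roundtrip is
 * hypothetical.
 */
#if 0 /* usage sketch */
static int vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[512] = { 0xab };
    uint8_t in[512];
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    if (ret < 0) {
        return ret;
    }
    return memcmp(out, in, sizeof(out)) ? -EIO : 0;
}
#endif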
/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel.  The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;


static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush.  Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk.  Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}
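/*
 * Illustrative only (not part of the original file, kept under #if 0 so it is
 * never compiled): the synchronous wrappers in this file (bdrv_flush(),
 * bdrv_pdiscard(), bdrv_common_block_status_above()) all follow the same
 * shape.  This sketch shows that shape for a hypothetical coroutine function;
 * bdrv_co_frob(), bdrv_frob() and FrobCo are all made up for the example.
 */
#if 0 /* pattern sketch */
typedef struct FrobCo {
    BlockDriverState *bs;
    int ret;
} FrobCo;

static void coroutine_fn bdrv_frob_co_entry(void *opaque)
{
    FrobCo *fc = opaque;

    fc->ret = bdrv_co_frob(fc->bs);
}

int bdrv_frob(BlockDriverState *bs)
{
    FrobCo fc = { .bs = bs, .ret = NOT_DONE };

    if (qemu_in_coroutine()) {
        /* Already in coroutine context: call the implementation directly */
        bdrv_frob_co_entry(&fc);
    } else {
        /* Otherwise spawn a coroutine and poll until it signals completion */
        Coroutine *co = qemu_coroutine_create(bdrv_frob_co_entry, &fc);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, fc.ret == NOT_DONE);
    }
    return fc.ret;
}
#endif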

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int bytes;
    int ret;
} DiscardCo;
static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
}

int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, req.offset, req.bytes);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}

int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}
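/*
 * Illustrative only (not part of the original file, kept under #if 0 so it is
 * never compiled): a minimal sketch showing how a bounce buffer is typically
 * obtained with the helpers above.  The *_try_* variants return NULL instead
 * of aborting on allocation failure, and buffers must be released with
 * qemu_vfree().  with_bounce_buffer() is a hypothetical name.
 */
#if 0 /* usage sketch */
static int with_bounce_buffer(BlockDriverState *bs, size_t size)
{
    /* Aligned for O_DIRECT-style backends and pre-zeroed */
    void *buf = qemu_try_blockalign0(bs, size);

    if (buf == NULL) {
        return -ENOMEM;
    }

    /* ... fill and submit the buffer ... */

    qemu_vfree(buf);
    return 0;
}
#endif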
/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}

void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_register_buf) {
        bs->drv->bdrv_register_buf(bs, host, size);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_register_buf(child->bs, host, size);
    }
}

void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_unregister_buf) {
        bs->drv->bdrv_unregister_buf(bs, host);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_unregister_buf(child->bs, host);
    }
}
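/*
 * Illustrative only (not part of the original file, kept under #if 0 so it is
 * never compiled): bdrv_io_plug() and bdrv_io_unplug() are reference counted
 * per node and must be strictly paired, typically bracketing a batch of
 * request submissions so that drivers which implement plugging can submit
 * them in one go.  submit_batch() and the request-issuing callback are
 * hypothetical.
 */
#if 0 /* usage sketch */
static void submit_batch(BlockDriverState *bs,
                         void (*issue_one)(BlockDriverState *bs, int i),
                         int nr_reqs)
{
    int i;

    bdrv_io_plug(bs);
    for (i = 0; i < nr_reqs; i++) {
        issue_one(bs, i);   /* queue requests without forcing submission */
    }
    bdrv_io_unplug(bs);     /* driver may now submit the whole batch */
}
#endif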