/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_group_config(bs, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled IOs */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;
    bdrv_start_throttled_reqs(bs);
    throttle_group_unregister_bs(bs);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is part of the same group as the one we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
        bs->bl.max_iov = bs->file->bs->bl.max_iov;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
        bs->bl.max_iov =
            MIN(bs->bl.max_iov,
                bs->backing->bs->bl.max_iov);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
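 * (Calls to bdrv_enable_copy_on_read() and bdrv_disable_copy_on_read() are
 * expected to be balanced; the counter is asserted never to drop below zero.)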
208 */ 209 void bdrv_enable_copy_on_read(BlockDriverState *bs) 210 { 211 bs->copy_on_read++; 212 } 213 214 void bdrv_disable_copy_on_read(BlockDriverState *bs) 215 { 216 assert(bs->copy_on_read > 0); 217 bs->copy_on_read--; 218 } 219 220 /* Check if any requests are in-flight (including throttled requests) */ 221 bool bdrv_requests_pending(BlockDriverState *bs) 222 { 223 BdrvChild *child; 224 225 if (!QLIST_EMPTY(&bs->tracked_requests)) { 226 return true; 227 } 228 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 229 return true; 230 } 231 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 232 return true; 233 } 234 235 QLIST_FOREACH(child, &bs->children, next) { 236 if (bdrv_requests_pending(child->bs)) { 237 return true; 238 } 239 } 240 241 return false; 242 } 243 244 static void bdrv_drain_recurse(BlockDriverState *bs) 245 { 246 BdrvChild *child; 247 248 if (bs->drv && bs->drv->bdrv_drain) { 249 bs->drv->bdrv_drain(bs); 250 } 251 QLIST_FOREACH(child, &bs->children, next) { 252 bdrv_drain_recurse(child->bs); 253 } 254 } 255 256 typedef struct { 257 Coroutine *co; 258 BlockDriverState *bs; 259 QEMUBH *bh; 260 bool done; 261 } BdrvCoDrainData; 262 263 static void bdrv_co_drain_bh_cb(void *opaque) 264 { 265 BdrvCoDrainData *data = opaque; 266 Coroutine *co = data->co; 267 268 qemu_bh_delete(data->bh); 269 bdrv_drain(data->bs); 270 data->done = true; 271 qemu_coroutine_enter(co, NULL); 272 } 273 274 void coroutine_fn bdrv_co_drain(BlockDriverState *bs) 275 { 276 BdrvCoDrainData data; 277 278 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and 279 * other coroutines run if they were queued from 280 * qemu_co_queue_run_restart(). */ 281 282 assert(qemu_in_coroutine()); 283 data = (BdrvCoDrainData) { 284 .co = qemu_coroutine_self(), 285 .bs = bs, 286 .done = false, 287 .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data), 288 }; 289 qemu_bh_schedule(data.bh); 290 291 qemu_coroutine_yield(); 292 /* If we are resumed from some other event (such as an aio completion or a 293 * timer callback), it is a bug in the caller that should be fixed. */ 294 assert(data.done); 295 } 296 297 /* 298 * Wait for pending requests to complete on a single BlockDriverState subtree, 299 * and suspend block driver's internal I/O until next request arrives. 300 * 301 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 302 * AioContext. 303 * 304 * Only this BlockDriverState's AioContext is run, so in-flight requests must 305 * not depend on events in other AioContexts. In that case, use 306 * bdrv_drain_all() instead. 307 */ 308 void bdrv_drain(BlockDriverState *bs) 309 { 310 bool busy = true; 311 312 bdrv_drain_recurse(bs); 313 if (qemu_in_coroutine()) { 314 bdrv_co_drain(bs); 315 return; 316 } 317 while (busy) { 318 /* Keep iterating */ 319 bdrv_flush_io_queue(bs); 320 busy = bdrv_requests_pending(bs); 321 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 322 } 323 } 324 325 /* 326 * Wait for pending requests to complete across all BlockDriverStates 327 * 328 * This function does not flush data to disk, use bdrv_flush_all() for that 329 * after calling this function. 
330 */ 331 void bdrv_drain_all(void) 332 { 333 /* Always run first iteration so any pending completion BHs run */ 334 bool busy = true; 335 BlockDriverState *bs = NULL; 336 GSList *aio_ctxs = NULL, *ctx; 337 338 while ((bs = bdrv_next(bs))) { 339 AioContext *aio_context = bdrv_get_aio_context(bs); 340 341 aio_context_acquire(aio_context); 342 if (bs->job) { 343 block_job_pause(bs->job); 344 } 345 bdrv_drain_recurse(bs); 346 aio_context_release(aio_context); 347 348 if (!g_slist_find(aio_ctxs, aio_context)) { 349 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 350 } 351 } 352 353 /* Note that completion of an asynchronous I/O operation can trigger any 354 * number of other I/O operations on other devices---for example a 355 * coroutine can submit an I/O request to another device in response to 356 * request completion. Therefore we must keep looping until there was no 357 * more activity rather than simply draining each device independently. 358 */ 359 while (busy) { 360 busy = false; 361 362 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 363 AioContext *aio_context = ctx->data; 364 bs = NULL; 365 366 aio_context_acquire(aio_context); 367 while ((bs = bdrv_next(bs))) { 368 if (aio_context == bdrv_get_aio_context(bs)) { 369 bdrv_flush_io_queue(bs); 370 if (bdrv_requests_pending(bs)) { 371 busy = true; 372 aio_poll(aio_context, busy); 373 } 374 } 375 } 376 busy |= aio_poll(aio_context, false); 377 aio_context_release(aio_context); 378 } 379 } 380 381 bs = NULL; 382 while ((bs = bdrv_next(bs))) { 383 AioContext *aio_context = bdrv_get_aio_context(bs); 384 385 aio_context_acquire(aio_context); 386 if (bs->job) { 387 block_job_resume(bs->job); 388 } 389 aio_context_release(aio_context); 390 } 391 g_slist_free(aio_ctxs); 392 } 393 394 /** 395 * Remove an active request from the tracked requests list 396 * 397 * This function should be called when a tracked request is completing. 
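 * (Ending a tracked request also restarts any coroutines waiting on its wait
 * queue so that they can re-check for overlapping requests.)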
398 */ 399 static void tracked_request_end(BdrvTrackedRequest *req) 400 { 401 if (req->serialising) { 402 req->bs->serialising_in_flight--; 403 } 404 405 QLIST_REMOVE(req, list); 406 qemu_co_queue_restart_all(&req->wait_queue); 407 } 408 409 /** 410 * Add an active request to the tracked requests list 411 */ 412 static void tracked_request_begin(BdrvTrackedRequest *req, 413 BlockDriverState *bs, 414 int64_t offset, 415 unsigned int bytes, 416 enum BdrvTrackedRequestType type) 417 { 418 *req = (BdrvTrackedRequest){ 419 .bs = bs, 420 .offset = offset, 421 .bytes = bytes, 422 .type = type, 423 .co = qemu_coroutine_self(), 424 .serialising = false, 425 .overlap_offset = offset, 426 .overlap_bytes = bytes, 427 }; 428 429 qemu_co_queue_init(&req->wait_queue); 430 431 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 432 } 433 434 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 435 { 436 int64_t overlap_offset = req->offset & ~(align - 1); 437 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 438 - overlap_offset; 439 440 if (!req->serialising) { 441 req->bs->serialising_in_flight++; 442 req->serialising = true; 443 } 444 445 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 446 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 447 } 448 449 /** 450 * Round a region to cluster boundaries 451 */ 452 void bdrv_round_to_clusters(BlockDriverState *bs, 453 int64_t sector_num, int nb_sectors, 454 int64_t *cluster_sector_num, 455 int *cluster_nb_sectors) 456 { 457 BlockDriverInfo bdi; 458 459 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 460 *cluster_sector_num = sector_num; 461 *cluster_nb_sectors = nb_sectors; 462 } else { 463 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 464 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 465 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 466 nb_sectors, c); 467 } 468 } 469 470 static int bdrv_get_cluster_size(BlockDriverState *bs) 471 { 472 BlockDriverInfo bdi; 473 int ret; 474 475 ret = bdrv_get_info(bs, &bdi); 476 if (ret < 0 || bdi.cluster_size == 0) { 477 return bs->request_alignment; 478 } else { 479 return bdi.cluster_size; 480 } 481 } 482 483 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 484 int64_t offset, unsigned int bytes) 485 { 486 /* aaaa bbbb */ 487 if (offset >= req->overlap_offset + req->overlap_bytes) { 488 return false; 489 } 490 /* bbbb aaaa */ 491 if (req->overlap_offset >= offset + bytes) { 492 return false; 493 } 494 return true; 495 } 496 497 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 498 { 499 BlockDriverState *bs = self->bs; 500 BdrvTrackedRequest *req; 501 bool retry; 502 bool waited = false; 503 504 if (!bs->serialising_in_flight) { 505 return false; 506 } 507 508 do { 509 retry = false; 510 QLIST_FOREACH(req, &bs->tracked_requests, list) { 511 if (req == self || (!req->serialising && !self->serialising)) { 512 continue; 513 } 514 if (tracked_request_overlaps(req, self->overlap_offset, 515 self->overlap_bytes)) 516 { 517 /* Hitting this means there was a reentrant request, for 518 * example, a block driver issuing nested requests. This must 519 * never happen since it means deadlock. 520 */ 521 assert(qemu_coroutine_self() != req->co); 522 523 /* If the request is already (indirectly) waiting for us, or 524 * will wait for us as soon as it wakes up, then just go on 525 * (instead of producing a deadlock in the former case). 
*/ 526 if (!req->waiting_for) { 527 self->waiting_for = req; 528 qemu_co_queue_wait(&req->wait_queue); 529 self->waiting_for = NULL; 530 retry = true; 531 waited = true; 532 break; 533 } 534 } 535 } 536 } while (retry); 537 538 return waited; 539 } 540 541 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 542 size_t size) 543 { 544 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 545 return -EIO; 546 } 547 548 if (!bdrv_is_inserted(bs)) { 549 return -ENOMEDIUM; 550 } 551 552 if (offset < 0) { 553 return -EIO; 554 } 555 556 return 0; 557 } 558 559 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 560 int nb_sectors) 561 { 562 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 563 return -EIO; 564 } 565 566 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 567 nb_sectors * BDRV_SECTOR_SIZE); 568 } 569 570 typedef struct RwCo { 571 BlockDriverState *bs; 572 int64_t offset; 573 QEMUIOVector *qiov; 574 bool is_write; 575 int ret; 576 BdrvRequestFlags flags; 577 } RwCo; 578 579 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 580 { 581 RwCo *rwco = opaque; 582 583 if (!rwco->is_write) { 584 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 585 rwco->qiov->size, rwco->qiov, 586 rwco->flags); 587 } else { 588 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 589 rwco->qiov->size, rwco->qiov, 590 rwco->flags); 591 } 592 } 593 594 /* 595 * Process a vectored synchronous request using coroutines 596 */ 597 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 598 QEMUIOVector *qiov, bool is_write, 599 BdrvRequestFlags flags) 600 { 601 Coroutine *co; 602 RwCo rwco = { 603 .bs = bs, 604 .offset = offset, 605 .qiov = qiov, 606 .is_write = is_write, 607 .ret = NOT_DONE, 608 .flags = flags, 609 }; 610 611 /** 612 * In sync call context, when the vcpu is blocked, this throttling timer 613 * will not fire; so the I/O throttling function has to be disabled here 614 * if it has been enabled. 615 */ 616 if (bs->io_limits_enabled) { 617 fprintf(stderr, "Disabling I/O throttling on '%s' due " 618 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 619 bdrv_io_limits_disable(bs); 620 } 621 622 if (qemu_in_coroutine()) { 623 /* Fast-path if already in coroutine context */ 624 bdrv_rw_co_entry(&rwco); 625 } else { 626 AioContext *aio_context = bdrv_get_aio_context(bs); 627 628 co = qemu_coroutine_create(bdrv_rw_co_entry); 629 qemu_coroutine_enter(co, &rwco); 630 while (rwco.ret == NOT_DONE) { 631 aio_poll(aio_context, true); 632 } 633 } 634 return rwco.ret; 635 } 636 637 /* 638 * Process a synchronous request using coroutines 639 */ 640 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 641 int nb_sectors, bool is_write, BdrvRequestFlags flags) 642 { 643 QEMUIOVector qiov; 644 struct iovec iov = { 645 .iov_base = (void *)buf, 646 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 647 }; 648 649 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 650 return -EINVAL; 651 } 652 653 qemu_iovec_init_external(&qiov, &iov, 1); 654 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 655 &qiov, is_write, flags); 656 } 657 658 /* return < 0 if error. See bdrv_write() for the return codes */ 659 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 660 uint8_t *buf, int nb_sectors) 661 { 662 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 663 } 664 665 /* Return < 0 if error. 
Important errors are: 666 -EIO generic I/O error (may happen for all errors) 667 -ENOMEDIUM No media inserted. 668 -EINVAL Invalid sector number or nb_sectors 669 -EACCES Trying to write a read-only device 670 */ 671 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 672 const uint8_t *buf, int nb_sectors) 673 { 674 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 675 } 676 677 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 678 int nb_sectors, BdrvRequestFlags flags) 679 { 680 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 681 BDRV_REQ_ZERO_WRITE | flags); 682 } 683 684 /* 685 * Completely zero out a block device with the help of bdrv_write_zeroes. 686 * The operation is sped up by checking the block status and only writing 687 * zeroes to the device if they currently do not return zeroes. Optional 688 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 689 * 690 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 691 */ 692 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 693 { 694 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 695 BlockDriverState *file; 696 int n; 697 698 target_sectors = bdrv_nb_sectors(bs); 699 if (target_sectors < 0) { 700 return target_sectors; 701 } 702 703 for (;;) { 704 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 705 if (nb_sectors <= 0) { 706 return 0; 707 } 708 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); 709 if (ret < 0) { 710 error_report("error getting block status at sector %" PRId64 ": %s", 711 sector_num, strerror(-ret)); 712 return ret; 713 } 714 if (ret & BDRV_BLOCK_ZERO) { 715 sector_num += n; 716 continue; 717 } 718 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 719 if (ret < 0) { 720 error_report("error writing zeroes at sector %" PRId64 ": %s", 721 sector_num, strerror(-ret)); 722 return ret; 723 } 724 sector_num += n; 725 } 726 } 727 728 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 729 { 730 QEMUIOVector qiov; 731 struct iovec iov = { 732 .iov_base = (void *)buf, 733 .iov_len = bytes, 734 }; 735 int ret; 736 737 if (bytes < 0) { 738 return -EINVAL; 739 } 740 741 qemu_iovec_init_external(&qiov, &iov, 1); 742 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 743 if (ret < 0) { 744 return ret; 745 } 746 747 return bytes; 748 } 749 750 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 751 { 752 int ret; 753 754 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 755 if (ret < 0) { 756 return ret; 757 } 758 759 return qiov->size; 760 } 761 762 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 763 const void *buf, int bytes) 764 { 765 QEMUIOVector qiov; 766 struct iovec iov = { 767 .iov_base = (void *) buf, 768 .iov_len = bytes, 769 }; 770 771 if (bytes < 0) { 772 return -EINVAL; 773 } 774 775 qemu_iovec_init_external(&qiov, &iov, 1); 776 return bdrv_pwritev(bs, offset, &qiov); 777 } 778 779 /* 780 * Writes to the file and ensures that no writes are reordered across this 781 * request (acts as a barrier) 782 * 783 * Returns 0 on success, -errno in error cases. 
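 * (Internally this is a bdrv_pwrite() followed by a bdrv_flush().)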
784 */ 785 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 786 const void *buf, int count) 787 { 788 int ret; 789 790 ret = bdrv_pwrite(bs, offset, buf, count); 791 if (ret < 0) { 792 return ret; 793 } 794 795 ret = bdrv_flush(bs); 796 if (ret < 0) { 797 return ret; 798 } 799 800 return 0; 801 } 802 803 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 804 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 805 { 806 /* Perform I/O through a temporary buffer so that users who scribble over 807 * their read buffer while the operation is in progress do not end up 808 * modifying the image file. This is critical for zero-copy guest I/O 809 * where anything might happen inside guest memory. 810 */ 811 void *bounce_buffer; 812 813 BlockDriver *drv = bs->drv; 814 struct iovec iov; 815 QEMUIOVector bounce_qiov; 816 int64_t cluster_sector_num; 817 int cluster_nb_sectors; 818 size_t skip_bytes; 819 int ret; 820 821 /* Cover entire cluster so no additional backing file I/O is required when 822 * allocating cluster in the image file. 823 */ 824 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 825 &cluster_sector_num, &cluster_nb_sectors); 826 827 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 828 cluster_sector_num, cluster_nb_sectors); 829 830 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 831 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 832 if (bounce_buffer == NULL) { 833 ret = -ENOMEM; 834 goto err; 835 } 836 837 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 838 839 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 840 &bounce_qiov); 841 if (ret < 0) { 842 goto err; 843 } 844 845 if (drv->bdrv_co_write_zeroes && 846 buffer_is_zero(bounce_buffer, iov.iov_len)) { 847 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 848 cluster_nb_sectors, 0); 849 } else { 850 /* This does not change the data on the disk, it is not necessary 851 * to flush even in cache=writethrough mode. 852 */ 853 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 854 &bounce_qiov); 855 } 856 857 if (ret < 0) { 858 /* It might be okay to ignore write errors for guest requests. If this 859 * is a deliberate copy-on-read then we don't want to ignore the error. 860 * Simply report it in all cases. 861 */ 862 goto err; 863 } 864 865 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 866 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 867 nb_sectors * BDRV_SECTOR_SIZE); 868 869 err: 870 qemu_vfree(bounce_buffer); 871 return ret; 872 } 873 874 /* 875 * Forwards an already correctly aligned request to the BlockDriver. This 876 * handles copy on read and zeroing after EOF; any other features must be 877 * implemented by the caller. 878 */ 879 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 880 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 881 int64_t align, QEMUIOVector *qiov, int flags) 882 { 883 BlockDriver *drv = bs->drv; 884 int ret; 885 886 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 887 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 888 889 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 890 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 891 assert(!qiov || bytes == qiov->size); 892 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 893 894 /* Handle Copy on Read and associated serialisation */ 895 if (flags & BDRV_REQ_COPY_ON_READ) { 896 /* If we touch the same cluster it counts as an overlap. 
This 897 * guarantees that allocating writes will be serialized and not race 898 * with each other for the same cluster. For example, in copy-on-read 899 * it ensures that the CoR read and write operations are atomic and 900 * guest writes cannot interleave between them. */ 901 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 902 } 903 904 if (!(flags & BDRV_REQ_NO_SERIALISING)) { 905 wait_serialising_requests(req); 906 } 907 908 if (flags & BDRV_REQ_COPY_ON_READ) { 909 int pnum; 910 911 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 912 if (ret < 0) { 913 goto out; 914 } 915 916 if (!ret || pnum != nb_sectors) { 917 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 918 goto out; 919 } 920 } 921 922 /* Forward the request to the BlockDriver */ 923 if (!bs->zero_beyond_eof) { 924 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 925 } else { 926 /* Read zeros after EOF */ 927 int64_t total_sectors, max_nb_sectors; 928 929 total_sectors = bdrv_nb_sectors(bs); 930 if (total_sectors < 0) { 931 ret = total_sectors; 932 goto out; 933 } 934 935 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 936 align >> BDRV_SECTOR_BITS); 937 if (nb_sectors < max_nb_sectors) { 938 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 939 } else if (max_nb_sectors > 0) { 940 QEMUIOVector local_qiov; 941 942 qemu_iovec_init(&local_qiov, qiov->niov); 943 qemu_iovec_concat(&local_qiov, qiov, 0, 944 max_nb_sectors * BDRV_SECTOR_SIZE); 945 946 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 947 &local_qiov); 948 949 qemu_iovec_destroy(&local_qiov); 950 } else { 951 ret = 0; 952 } 953 954 /* Reading beyond end of file is supposed to produce zeroes */ 955 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 956 uint64_t offset = MAX(0, total_sectors - sector_num); 957 uint64_t bytes = (sector_num + nb_sectors - offset) * 958 BDRV_SECTOR_SIZE; 959 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 960 } 961 } 962 963 out: 964 return ret; 965 } 966 967 /* 968 * Handle a read request in coroutine context 969 */ 970 int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 971 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 972 BdrvRequestFlags flags) 973 { 974 BlockDriver *drv = bs->drv; 975 BdrvTrackedRequest req; 976 977 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 978 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 979 uint8_t *head_buf = NULL; 980 uint8_t *tail_buf = NULL; 981 QEMUIOVector local_qiov; 982 bool use_local_qiov = false; 983 int ret; 984 985 if (!drv) { 986 return -ENOMEDIUM; 987 } 988 989 ret = bdrv_check_byte_request(bs, offset, bytes); 990 if (ret < 0) { 991 return ret; 992 } 993 994 /* Don't do copy-on-read if we read data before write operation */ 995 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { 996 flags |= BDRV_REQ_COPY_ON_READ; 997 } 998 999 /* throttling disk I/O */ 1000 if (bs->io_limits_enabled) { 1001 throttle_group_co_io_limits_intercept(bs, bytes, false); 1002 } 1003 1004 /* Align read if necessary by padding qiov */ 1005 if (offset & (align - 1)) { 1006 head_buf = qemu_blockalign(bs, align); 1007 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1008 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1009 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1010 use_local_qiov = true; 1011 1012 bytes += offset & (align - 1); 1013 offset = offset & ~(align - 1); 1014 } 1015 1016 if ((offset + bytes) & (align - 1)) { 1017 if 
(!use_local_qiov) { 1018 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1019 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1020 use_local_qiov = true; 1021 } 1022 tail_buf = qemu_blockalign(bs, align); 1023 qemu_iovec_add(&local_qiov, tail_buf, 1024 align - ((offset + bytes) & (align - 1))); 1025 1026 bytes = ROUND_UP(bytes, align); 1027 } 1028 1029 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 1030 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 1031 use_local_qiov ? &local_qiov : qiov, 1032 flags); 1033 tracked_request_end(&req); 1034 1035 if (use_local_qiov) { 1036 qemu_iovec_destroy(&local_qiov); 1037 qemu_vfree(head_buf); 1038 qemu_vfree(tail_buf); 1039 } 1040 1041 return ret; 1042 } 1043 1044 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 1045 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1046 BdrvRequestFlags flags) 1047 { 1048 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1049 return -EINVAL; 1050 } 1051 1052 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 1053 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1054 } 1055 1056 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 1057 int nb_sectors, QEMUIOVector *qiov) 1058 { 1059 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1060 1061 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1062 } 1063 1064 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, 1065 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1066 { 1067 trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); 1068 1069 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1070 BDRV_REQ_NO_SERIALISING); 1071 } 1072 1073 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1074 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1075 { 1076 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1077 1078 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1079 BDRV_REQ_COPY_ON_READ); 1080 } 1081 1082 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1083 1084 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1085 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1086 { 1087 BlockDriver *drv = bs->drv; 1088 QEMUIOVector qiov; 1089 struct iovec iov = {0}; 1090 int ret = 0; 1091 1092 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1093 BDRV_REQUEST_MAX_SECTORS); 1094 1095 while (nb_sectors > 0 && !ret) { 1096 int num = nb_sectors; 1097 1098 /* Align request. Block drivers can expect the "bulk" of the request 1099 * to be aligned. 1100 */ 1101 if (bs->bl.write_zeroes_alignment 1102 && num > bs->bl.write_zeroes_alignment) { 1103 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1104 /* Make a small request up to the first aligned sector. */ 1105 num = bs->bl.write_zeroes_alignment; 1106 num -= sector_num % bs->bl.write_zeroes_alignment; 1107 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1108 /* Shorten the request to the last aligned sector. num cannot 1109 * underflow because num > bs->bl.write_zeroes_alignment. 
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else if (drv->bdrv_co_writev_flags) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
                                        flags);
    } else {
        assert(drv->supported_write_flags == 0);
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && (flags & BDRV_REQ_FUA) &&
        !(drv->supported_write_flags & BDRV_REQ_FUA))
    {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_offset < offset
+ bytes) { 1222 bs->wr_highest_offset = offset + bytes; 1223 } 1224 1225 if (ret >= 0) { 1226 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1227 } 1228 1229 return ret; 1230 } 1231 1232 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, 1233 int64_t offset, 1234 unsigned int bytes, 1235 BdrvRequestFlags flags, 1236 BdrvTrackedRequest *req) 1237 { 1238 uint8_t *buf = NULL; 1239 QEMUIOVector local_qiov; 1240 struct iovec iov; 1241 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1242 unsigned int head_padding_bytes, tail_padding_bytes; 1243 int ret = 0; 1244 1245 head_padding_bytes = offset & (align - 1); 1246 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1247 1248 1249 assert(flags & BDRV_REQ_ZERO_WRITE); 1250 if (head_padding_bytes || tail_padding_bytes) { 1251 buf = qemu_blockalign(bs, align); 1252 iov = (struct iovec) { 1253 .iov_base = buf, 1254 .iov_len = align, 1255 }; 1256 qemu_iovec_init_external(&local_qiov, &iov, 1); 1257 } 1258 if (head_padding_bytes) { 1259 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1260 1261 /* RMW the unaligned part before head. */ 1262 mark_request_serialising(req, align); 1263 wait_serialising_requests(req); 1264 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1265 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1266 align, &local_qiov, 0); 1267 if (ret < 0) { 1268 goto fail; 1269 } 1270 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1271 1272 memset(buf + head_padding_bytes, 0, zero_bytes); 1273 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1274 &local_qiov, 1275 flags & ~BDRV_REQ_ZERO_WRITE); 1276 if (ret < 0) { 1277 goto fail; 1278 } 1279 offset += zero_bytes; 1280 bytes -= zero_bytes; 1281 } 1282 1283 assert(!bytes || (offset & (align - 1)) == 0); 1284 if (bytes >= align) { 1285 /* Write the aligned part in the middle. */ 1286 uint64_t aligned_bytes = bytes & ~(align - 1); 1287 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1288 NULL, flags); 1289 if (ret < 0) { 1290 goto fail; 1291 } 1292 bytes -= aligned_bytes; 1293 offset += aligned_bytes; 1294 } 1295 1296 assert(!bytes || (offset & (align - 1)) == 0); 1297 if (bytes) { 1298 assert(align == tail_padding_bytes + bytes); 1299 /* RMW the unaligned part after tail. 
*/ 1300 mark_request_serialising(req, align); 1301 wait_serialising_requests(req); 1302 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1303 ret = bdrv_aligned_preadv(bs, req, offset, align, 1304 align, &local_qiov, 0); 1305 if (ret < 0) { 1306 goto fail; 1307 } 1308 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1309 1310 memset(buf, 0, bytes); 1311 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1312 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1313 } 1314 fail: 1315 qemu_vfree(buf); 1316 return ret; 1317 1318 } 1319 1320 /* 1321 * Handle a write request in coroutine context 1322 */ 1323 int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1324 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1325 BdrvRequestFlags flags) 1326 { 1327 BdrvTrackedRequest req; 1328 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1329 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1330 uint8_t *head_buf = NULL; 1331 uint8_t *tail_buf = NULL; 1332 QEMUIOVector local_qiov; 1333 bool use_local_qiov = false; 1334 int ret; 1335 1336 if (!bs->drv) { 1337 return -ENOMEDIUM; 1338 } 1339 if (bs->read_only) { 1340 return -EPERM; 1341 } 1342 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1343 1344 ret = bdrv_check_byte_request(bs, offset, bytes); 1345 if (ret < 0) { 1346 return ret; 1347 } 1348 1349 /* throttling disk I/O */ 1350 if (bs->io_limits_enabled) { 1351 throttle_group_co_io_limits_intercept(bs, bytes, true); 1352 } 1353 1354 /* 1355 * Align write if necessary by performing a read-modify-write cycle. 1356 * Pad qiov with the read parts and be sure to have a tracked request not 1357 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1358 */ 1359 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1360 1361 if (!qiov) { 1362 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1363 goto out; 1364 } 1365 1366 if (offset & (align - 1)) { 1367 QEMUIOVector head_qiov; 1368 struct iovec head_iov; 1369 1370 mark_request_serialising(&req, align); 1371 wait_serialising_requests(&req); 1372 1373 head_buf = qemu_blockalign(bs, align); 1374 head_iov = (struct iovec) { 1375 .iov_base = head_buf, 1376 .iov_len = align, 1377 }; 1378 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1379 1380 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1381 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1382 align, &head_qiov, 0); 1383 if (ret < 0) { 1384 goto fail; 1385 } 1386 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1387 1388 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1389 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1390 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1391 use_local_qiov = true; 1392 1393 bytes += offset & (align - 1); 1394 offset = offset & ~(align - 1); 1395 } 1396 1397 if ((offset + bytes) & (align - 1)) { 1398 QEMUIOVector tail_qiov; 1399 struct iovec tail_iov; 1400 size_t tail_bytes; 1401 bool waited; 1402 1403 mark_request_serialising(&req, align); 1404 waited = wait_serialising_requests(&req); 1405 assert(!waited || !use_local_qiov); 1406 1407 tail_buf = qemu_blockalign(bs, align); 1408 tail_iov = (struct iovec) { 1409 .iov_base = tail_buf, 1410 .iov_len = align, 1411 }; 1412 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1413 1414 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1415 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1416 align, &tail_qiov, 0); 1417 if (ret < 0) { 1418 goto fail; 1419 } 1420 bdrv_debug_event(bs, 
BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1421 1422 if (!use_local_qiov) { 1423 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1424 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1425 use_local_qiov = true; 1426 } 1427 1428 tail_bytes = (offset + bytes) & (align - 1); 1429 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1430 1431 bytes = ROUND_UP(bytes, align); 1432 } 1433 1434 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1435 use_local_qiov ? &local_qiov : qiov, 1436 flags); 1437 1438 fail: 1439 1440 if (use_local_qiov) { 1441 qemu_iovec_destroy(&local_qiov); 1442 } 1443 qemu_vfree(head_buf); 1444 qemu_vfree(tail_buf); 1445 out: 1446 tracked_request_end(&req); 1447 return ret; 1448 } 1449 1450 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1451 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1452 BdrvRequestFlags flags) 1453 { 1454 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1455 return -EINVAL; 1456 } 1457 1458 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1459 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1460 } 1461 1462 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1463 int nb_sectors, QEMUIOVector *qiov) 1464 { 1465 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1466 1467 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1468 } 1469 1470 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1471 int64_t sector_num, int nb_sectors, 1472 BdrvRequestFlags flags) 1473 { 1474 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1475 1476 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1477 flags &= ~BDRV_REQ_MAY_UNMAP; 1478 } 1479 1480 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1481 BDRV_REQ_ZERO_WRITE | flags); 1482 } 1483 1484 typedef struct BdrvCoGetBlockStatusData { 1485 BlockDriverState *bs; 1486 BlockDriverState *base; 1487 BlockDriverState **file; 1488 int64_t sector_num; 1489 int nb_sectors; 1490 int *pnum; 1491 int64_t ret; 1492 bool done; 1493 } BdrvCoGetBlockStatusData; 1494 1495 /* 1496 * Returns the allocation status of the specified sectors. 1497 * Drivers not implementing the functionality are assumed to not support 1498 * backing files, hence all their sectors are reported as allocated. 1499 * 1500 * If 'sector_num' is beyond the end of the disk image the return value is 0 1501 * and 'pnum' is set to 0. 1502 * 1503 * 'pnum' is set to the number of sectors (including and immediately following 1504 * the specified sector) that are known to be in the same 1505 * allocated/unallocated state. 1506 * 1507 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1508 * beyond the end of the disk image it will be clamped. 1509 * 1510 * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' 1511 * points to the BDS which the sector range is allocated in. 
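 * (The returned value combines BDRV_BLOCK_* flags and, when
 * BDRV_BLOCK_OFFSET_VALID is set, the corresponding byte offset into 'file'.)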
1512 */ 1513 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1514 int64_t sector_num, 1515 int nb_sectors, int *pnum, 1516 BlockDriverState **file) 1517 { 1518 int64_t total_sectors; 1519 int64_t n; 1520 int64_t ret, ret2; 1521 1522 total_sectors = bdrv_nb_sectors(bs); 1523 if (total_sectors < 0) { 1524 return total_sectors; 1525 } 1526 1527 if (sector_num >= total_sectors) { 1528 *pnum = 0; 1529 return 0; 1530 } 1531 1532 n = total_sectors - sector_num; 1533 if (n < nb_sectors) { 1534 nb_sectors = n; 1535 } 1536 1537 if (!bs->drv->bdrv_co_get_block_status) { 1538 *pnum = nb_sectors; 1539 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1540 if (bs->drv->protocol_name) { 1541 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1542 } 1543 return ret; 1544 } 1545 1546 *file = NULL; 1547 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, 1548 file); 1549 if (ret < 0) { 1550 *pnum = 0; 1551 return ret; 1552 } 1553 1554 if (ret & BDRV_BLOCK_RAW) { 1555 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1556 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1557 *pnum, pnum, file); 1558 } 1559 1560 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1561 ret |= BDRV_BLOCK_ALLOCATED; 1562 } else { 1563 if (bdrv_unallocated_blocks_are_zero(bs)) { 1564 ret |= BDRV_BLOCK_ZERO; 1565 } else if (bs->backing) { 1566 BlockDriverState *bs2 = bs->backing->bs; 1567 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1568 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1569 ret |= BDRV_BLOCK_ZERO; 1570 } 1571 } 1572 } 1573 1574 if (*file && *file != bs && 1575 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1576 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1577 BlockDriverState *file2; 1578 int file_pnum; 1579 1580 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, 1581 *pnum, &file_pnum, &file2); 1582 if (ret2 >= 0) { 1583 /* Ignore errors. This is just providing extra information, it 1584 * is useful but not necessary. 1585 */ 1586 if (!file_pnum) { 1587 /* !file_pnum indicates an offset at or beyond the EOF; it is 1588 * perfectly valid for the format block driver to point to such 1589 * offsets, so catch it and mark everything as zero */ 1590 ret |= BDRV_BLOCK_ZERO; 1591 } else { 1592 /* Limit request to the range reported by the protocol driver */ 1593 *pnum = file_pnum; 1594 ret |= (ret2 & BDRV_BLOCK_ZERO); 1595 } 1596 } 1597 } 1598 1599 return ret; 1600 } 1601 1602 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1603 BlockDriverState *base, 1604 int64_t sector_num, 1605 int nb_sectors, 1606 int *pnum, 1607 BlockDriverState **file) 1608 { 1609 BlockDriverState *p; 1610 int64_t ret = 0; 1611 1612 assert(bs != base); 1613 for (p = bs; p != base; p = backing_bs(p)) { 1614 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); 1615 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1616 break; 1617 } 1618 /* [sector_num, pnum] unallocated on this layer, which could be only 1619 * the first part of [sector_num, nb_sectors]. 
*/ 1620 nb_sectors = MIN(nb_sectors, *pnum); 1621 } 1622 return ret; 1623 } 1624 1625 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1626 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1627 { 1628 BdrvCoGetBlockStatusData *data = opaque; 1629 1630 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1631 data->sector_num, 1632 data->nb_sectors, 1633 data->pnum, 1634 data->file); 1635 data->done = true; 1636 } 1637 1638 /* 1639 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1640 * 1641 * See bdrv_co_get_block_status_above() for details. 1642 */ 1643 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1644 BlockDriverState *base, 1645 int64_t sector_num, 1646 int nb_sectors, int *pnum, 1647 BlockDriverState **file) 1648 { 1649 Coroutine *co; 1650 BdrvCoGetBlockStatusData data = { 1651 .bs = bs, 1652 .base = base, 1653 .file = file, 1654 .sector_num = sector_num, 1655 .nb_sectors = nb_sectors, 1656 .pnum = pnum, 1657 .done = false, 1658 }; 1659 1660 if (qemu_in_coroutine()) { 1661 /* Fast-path if already in coroutine context */ 1662 bdrv_get_block_status_above_co_entry(&data); 1663 } else { 1664 AioContext *aio_context = bdrv_get_aio_context(bs); 1665 1666 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1667 qemu_coroutine_enter(co, &data); 1668 while (!data.done) { 1669 aio_poll(aio_context, true); 1670 } 1671 } 1672 return data.ret; 1673 } 1674 1675 int64_t bdrv_get_block_status(BlockDriverState *bs, 1676 int64_t sector_num, 1677 int nb_sectors, int *pnum, 1678 BlockDriverState **file) 1679 { 1680 return bdrv_get_block_status_above(bs, backing_bs(bs), 1681 sector_num, nb_sectors, pnum, file); 1682 } 1683 1684 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1685 int nb_sectors, int *pnum) 1686 { 1687 BlockDriverState *file; 1688 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, 1689 &file); 1690 if (ret < 0) { 1691 return ret; 1692 } 1693 return !!(ret & BDRV_BLOCK_ALLOCATED); 1694 } 1695 1696 /* 1697 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1698 * 1699 * Return true if the given sector is allocated in any image between 1700 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1701 * sector is allocated in any image of the chain. Return false otherwise. 1702 * 1703 * 'pnum' is set to the number of sectors (including and immediately following 1704 * the specified sector) that are known to be in the same 1705 * allocated/unallocated state. 1706 * 1707 */ 1708 int bdrv_is_allocated_above(BlockDriverState *top, 1709 BlockDriverState *base, 1710 int64_t sector_num, 1711 int nb_sectors, int *pnum) 1712 { 1713 BlockDriverState *intermediate; 1714 int ret, n = nb_sectors; 1715 1716 intermediate = top; 1717 while (intermediate && intermediate != base) { 1718 int pnum_inter; 1719 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1720 &pnum_inter); 1721 if (ret < 0) { 1722 return ret; 1723 } else if (ret) { 1724 *pnum = pnum_inter; 1725 return 1; 1726 } 1727 1728 /* 1729 * [sector_num, nb_sectors] is unallocated on top but intermediate 1730 * might have 1731 * 1732 * [sector_num+x, nr_sectors] allocated. 
1733 */ 1734 if (n > pnum_inter && 1735 (intermediate == top || 1736 sector_num + pnum_inter < intermediate->total_sectors)) { 1737 n = pnum_inter; 1738 } 1739 1740 intermediate = backing_bs(intermediate); 1741 } 1742 1743 *pnum = n; 1744 return 0; 1745 } 1746 1747 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1748 const uint8_t *buf, int nb_sectors) 1749 { 1750 BlockDriver *drv = bs->drv; 1751 int ret; 1752 1753 if (!drv) { 1754 return -ENOMEDIUM; 1755 } 1756 if (!drv->bdrv_write_compressed) { 1757 return -ENOTSUP; 1758 } 1759 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1760 if (ret < 0) { 1761 return ret; 1762 } 1763 1764 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1765 1766 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1767 } 1768 1769 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1770 int64_t pos, int size) 1771 { 1772 QEMUIOVector qiov; 1773 struct iovec iov = { 1774 .iov_base = (void *) buf, 1775 .iov_len = size, 1776 }; 1777 1778 qemu_iovec_init_external(&qiov, &iov, 1); 1779 return bdrv_writev_vmstate(bs, &qiov, pos); 1780 } 1781 1782 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1783 { 1784 BlockDriver *drv = bs->drv; 1785 1786 if (!drv) { 1787 return -ENOMEDIUM; 1788 } else if (drv->bdrv_save_vmstate) { 1789 return drv->bdrv_save_vmstate(bs, qiov, pos); 1790 } else if (bs->file) { 1791 return bdrv_writev_vmstate(bs->file->bs, qiov, pos); 1792 } 1793 1794 return -ENOTSUP; 1795 } 1796 1797 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1798 int64_t pos, int size) 1799 { 1800 BlockDriver *drv = bs->drv; 1801 if (!drv) 1802 return -ENOMEDIUM; 1803 if (drv->bdrv_load_vmstate) 1804 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1805 if (bs->file) 1806 return bdrv_load_vmstate(bs->file->bs, buf, pos, size); 1807 return -ENOTSUP; 1808 } 1809 1810 /**************************************************************/ 1811 /* async I/Os */ 1812 1813 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1814 QEMUIOVector *qiov, int nb_sectors, 1815 BlockCompletionFunc *cb, void *opaque) 1816 { 1817 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1818 1819 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1820 cb, opaque, false); 1821 } 1822 1823 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1824 QEMUIOVector *qiov, int nb_sectors, 1825 BlockCompletionFunc *cb, void *opaque) 1826 { 1827 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1828 1829 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1830 cb, opaque, true); 1831 } 1832 1833 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1834 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1835 BlockCompletionFunc *cb, void *opaque) 1836 { 1837 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1838 1839 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1840 BDRV_REQ_ZERO_WRITE | flags, 1841 cb, opaque, true); 1842 } 1843 1844 1845 typedef struct MultiwriteCB { 1846 int error; 1847 int num_requests; 1848 int num_callbacks; 1849 struct { 1850 BlockCompletionFunc *cb; 1851 void *opaque; 1852 QEMUIOVector *free_qiov; 1853 } callbacks[]; 1854 } MultiwriteCB; 1855 1856 static void multiwrite_user_cb(MultiwriteCB *mcb) 1857 { 1858 int i; 1859 1860 for (i = 0; i < mcb->num_callbacks; i++) { 1861 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1862 if (mcb->callbacks[i].free_qiov) { 1863 
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
            bs->bl.max_iov) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    if (bs->blk) {
        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
                              num_reqs - outidx - 1);
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted.
 * On error, the function returns -1, and any of the 1985 * requests may or may not have been submitted yet. In particular, this means that the 1986 * callback may have run for some of the requests but not for others. The 1987 * caller must check the error field of the BlockRequest to wait for the right 1988 * callbacks (if error != 0, no callback will be called). 1989 * 1990 * The implementation may modify the contents of the reqs array, e.g. to merge 1991 * requests. However, the fields opaque and error are left unmodified as they 1992 * are used to signal failure for a single request to the caller. 1993 */ 1994 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 1995 { 1996 MultiwriteCB *mcb; 1997 int i; 1998 1999 /* don't submit writes if we don't have a medium */ 2000 if (bs->drv == NULL) { 2001 for (i = 0; i < num_reqs; i++) { 2002 reqs[i].error = -ENOMEDIUM; 2003 } 2004 return -1; 2005 } 2006 2007 if (num_reqs == 0) { 2008 return 0; 2009 } 2010 2011 // Create MultiwriteCB structure 2012 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 2013 mcb->num_requests = 0; 2014 mcb->num_callbacks = num_reqs; 2015 2016 for (i = 0; i < num_reqs; i++) { 2017 mcb->callbacks[i].cb = reqs[i].cb; 2018 mcb->callbacks[i].opaque = reqs[i].opaque; 2019 } 2020 2021 // Check for mergeable requests 2022 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 2023 2024 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 2025 2026 /* Run the aio requests. */ 2027 mcb->num_requests = num_reqs; 2028 for (i = 0; i < num_reqs; i++) { 2029 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 2030 reqs[i].nb_sectors, reqs[i].flags, 2031 multiwrite_cb, mcb, 2032 true); 2033 } 2034 2035 return 0; 2036 } 2037 2038 void bdrv_aio_cancel(BlockAIOCB *acb) 2039 { 2040 qemu_aio_ref(acb); 2041 bdrv_aio_cancel_async(acb); 2042 while (acb->refcnt > 1) { 2043 if (acb->aiocb_info->get_aio_context) { 2044 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2045 } else if (acb->bs) { 2046 aio_poll(bdrv_get_aio_context(acb->bs), true); 2047 } else { 2048 abort(); 2049 } 2050 } 2051 qemu_aio_unref(acb); 2052 } 2053 2054 /* Async version of aio cancel. The caller is not blocked if the acb implements 2055 * cancel_async; otherwise we do nothing and let the request complete normally. 2056 * In either case the completion callback must be called.
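 * (This is in contrast to bdrv_aio_cancel() above, which takes an extra
 * reference and polls the AioContext until the request has actually
 * completed, i.e. it blocks the caller.)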
*/ 2057 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2058 { 2059 if (acb->aiocb_info->cancel_async) { 2060 acb->aiocb_info->cancel_async(acb); 2061 } 2062 } 2063 2064 /**************************************************************/ 2065 /* async block device emulation */ 2066 2067 typedef struct BlockAIOCBSync { 2068 BlockAIOCB common; 2069 QEMUBH *bh; 2070 int ret; 2071 /* vector translation state */ 2072 QEMUIOVector *qiov; 2073 uint8_t *bounce; 2074 int is_write; 2075 } BlockAIOCBSync; 2076 2077 static const AIOCBInfo bdrv_em_aiocb_info = { 2078 .aiocb_size = sizeof(BlockAIOCBSync), 2079 }; 2080 2081 static void bdrv_aio_bh_cb(void *opaque) 2082 { 2083 BlockAIOCBSync *acb = opaque; 2084 2085 if (!acb->is_write && acb->ret >= 0) { 2086 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 2087 } 2088 qemu_vfree(acb->bounce); 2089 acb->common.cb(acb->common.opaque, acb->ret); 2090 qemu_bh_delete(acb->bh); 2091 acb->bh = NULL; 2092 qemu_aio_unref(acb); 2093 } 2094 2095 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 2096 int64_t sector_num, 2097 QEMUIOVector *qiov, 2098 int nb_sectors, 2099 BlockCompletionFunc *cb, 2100 void *opaque, 2101 int is_write) 2102 2103 { 2104 BlockAIOCBSync *acb; 2105 2106 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 2107 acb->is_write = is_write; 2108 acb->qiov = qiov; 2109 acb->bounce = qemu_try_blockalign(bs, qiov->size); 2110 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); 2111 2112 if (acb->bounce == NULL) { 2113 acb->ret = -ENOMEM; 2114 } else if (is_write) { 2115 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 2116 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 2117 } else { 2118 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 2119 } 2120 2121 qemu_bh_schedule(acb->bh); 2122 2123 return &acb->common; 2124 } 2125 2126 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 2127 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2128 BlockCompletionFunc *cb, void *opaque) 2129 { 2130 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 2131 } 2132 2133 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 2134 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2135 BlockCompletionFunc *cb, void *opaque) 2136 { 2137 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 2138 } 2139 2140 2141 typedef struct BlockAIOCBCoroutine { 2142 BlockAIOCB common; 2143 BlockRequest req; 2144 bool is_write; 2145 bool need_bh; 2146 bool *done; 2147 QEMUBH* bh; 2148 } BlockAIOCBCoroutine; 2149 2150 static const AIOCBInfo bdrv_em_co_aiocb_info = { 2151 .aiocb_size = sizeof(BlockAIOCBCoroutine), 2152 }; 2153 2154 static void bdrv_co_complete(BlockAIOCBCoroutine *acb) 2155 { 2156 if (!acb->need_bh) { 2157 acb->common.cb(acb->common.opaque, acb->req.error); 2158 qemu_aio_unref(acb); 2159 } 2160 } 2161 2162 static void bdrv_co_em_bh(void *opaque) 2163 { 2164 BlockAIOCBCoroutine *acb = opaque; 2165 2166 assert(!acb->need_bh); 2167 qemu_bh_delete(acb->bh); 2168 bdrv_co_complete(acb); 2169 } 2170 2171 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) 2172 { 2173 acb->need_bh = false; 2174 if (acb->req.error != -EINPROGRESS) { 2175 BlockDriverState *bs = acb->common.bs; 2176 2177 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 2178 qemu_bh_schedule(acb->bh); 2179 } 2180 } 2181 2182 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 2183 static void coroutine_fn 
bdrv_co_do_rw(void *opaque) 2184 { 2185 BlockAIOCBCoroutine *acb = opaque; 2186 BlockDriverState *bs = acb->common.bs; 2187 2188 if (!acb->is_write) { 2189 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 2190 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2191 } else { 2192 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 2193 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2194 } 2195 2196 bdrv_co_complete(acb); 2197 } 2198 2199 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 2200 int64_t sector_num, 2201 QEMUIOVector *qiov, 2202 int nb_sectors, 2203 BdrvRequestFlags flags, 2204 BlockCompletionFunc *cb, 2205 void *opaque, 2206 bool is_write) 2207 { 2208 Coroutine *co; 2209 BlockAIOCBCoroutine *acb; 2210 2211 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2212 acb->need_bh = true; 2213 acb->req.error = -EINPROGRESS; 2214 acb->req.sector = sector_num; 2215 acb->req.nb_sectors = nb_sectors; 2216 acb->req.qiov = qiov; 2217 acb->req.flags = flags; 2218 acb->is_write = is_write; 2219 2220 co = qemu_coroutine_create(bdrv_co_do_rw); 2221 qemu_coroutine_enter(co, acb); 2222 2223 bdrv_co_maybe_schedule_bh(acb); 2224 return &acb->common; 2225 } 2226 2227 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 2228 { 2229 BlockAIOCBCoroutine *acb = opaque; 2230 BlockDriverState *bs = acb->common.bs; 2231 2232 acb->req.error = bdrv_co_flush(bs); 2233 bdrv_co_complete(acb); 2234 } 2235 2236 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, 2237 BlockCompletionFunc *cb, void *opaque) 2238 { 2239 trace_bdrv_aio_flush(bs, opaque); 2240 2241 Coroutine *co; 2242 BlockAIOCBCoroutine *acb; 2243 2244 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2245 acb->need_bh = true; 2246 acb->req.error = -EINPROGRESS; 2247 2248 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 2249 qemu_coroutine_enter(co, acb); 2250 2251 bdrv_co_maybe_schedule_bh(acb); 2252 return &acb->common; 2253 } 2254 2255 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 2256 { 2257 BlockAIOCBCoroutine *acb = opaque; 2258 BlockDriverState *bs = acb->common.bs; 2259 2260 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 2261 bdrv_co_complete(acb); 2262 } 2263 2264 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, 2265 int64_t sector_num, int nb_sectors, 2266 BlockCompletionFunc *cb, void *opaque) 2267 { 2268 Coroutine *co; 2269 BlockAIOCBCoroutine *acb; 2270 2271 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 2272 2273 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2274 acb->need_bh = true; 2275 acb->req.error = -EINPROGRESS; 2276 acb->req.sector = sector_num; 2277 acb->req.nb_sectors = nb_sectors; 2278 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 2279 qemu_coroutine_enter(co, acb); 2280 2281 bdrv_co_maybe_schedule_bh(acb); 2282 return &acb->common; 2283 } 2284 2285 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 2286 BlockCompletionFunc *cb, void *opaque) 2287 { 2288 BlockAIOCB *acb; 2289 2290 acb = g_malloc(aiocb_info->aiocb_size); 2291 acb->aiocb_info = aiocb_info; 2292 acb->bs = bs; 2293 acb->cb = cb; 2294 acb->opaque = opaque; 2295 acb->refcnt = 1; 2296 return acb; 2297 } 2298 2299 void qemu_aio_ref(void *p) 2300 { 2301 BlockAIOCB *acb = p; 2302 acb->refcnt++; 2303 } 2304 2305 void qemu_aio_unref(void *p) 2306 { 2307 BlockAIOCB *acb = p; 2308 assert(acb->refcnt > 0); 2309 if (--acb->refcnt == 0) { 2310 g_free(acb); 2311 } 2312 } 2313 2314 
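/*
 * Note on AIOCB lifetime, summarising the helpers above: qemu_aio_get()
 * returns an AIOCB with refcnt == 1, and the emulation code drops that
 * reference with qemu_aio_unref() once the completion callback has run.
 * Code that needs the AIOCB to stay valid across a poll, such as
 * bdrv_aio_cancel(), takes an extra reference with qemu_aio_ref() and
 * releases it afterwards.
 */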
/**************************************************************/ 2315 /* Coroutine block device emulation */ 2316 2317 typedef struct CoroutineIOCompletion { 2318 Coroutine *coroutine; 2319 int ret; 2320 } CoroutineIOCompletion; 2321 2322 static void bdrv_co_io_em_complete(void *opaque, int ret) 2323 { 2324 CoroutineIOCompletion *co = opaque; 2325 2326 co->ret = ret; 2327 qemu_coroutine_enter(co->coroutine, NULL); 2328 } 2329 2330 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 2331 int nb_sectors, QEMUIOVector *iov, 2332 bool is_write) 2333 { 2334 CoroutineIOCompletion co = { 2335 .coroutine = qemu_coroutine_self(), 2336 }; 2337 BlockAIOCB *acb; 2338 2339 if (is_write) { 2340 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 2341 bdrv_co_io_em_complete, &co); 2342 } else { 2343 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 2344 bdrv_co_io_em_complete, &co); 2345 } 2346 2347 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 2348 if (!acb) { 2349 return -EIO; 2350 } 2351 qemu_coroutine_yield(); 2352 2353 return co.ret; 2354 } 2355 2356 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 2357 int64_t sector_num, int nb_sectors, 2358 QEMUIOVector *iov) 2359 { 2360 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 2361 } 2362 2363 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 2364 int64_t sector_num, int nb_sectors, 2365 QEMUIOVector *iov) 2366 { 2367 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 2368 } 2369 2370 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 2371 { 2372 RwCo *rwco = opaque; 2373 2374 rwco->ret = bdrv_co_flush(rwco->bs); 2375 } 2376 2377 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2378 { 2379 int ret; 2380 BdrvTrackedRequest req; 2381 2382 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2383 bdrv_is_sg(bs)) { 2384 return 0; 2385 } 2386 2387 tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); 2388 2389 /* Write back all layers by calling one driver function */ 2390 if (bs->drv->bdrv_co_flush) { 2391 ret = bs->drv->bdrv_co_flush(bs); 2392 goto out; 2393 } 2394 2395 /* Write back cached data to the OS even with cache=unsafe */ 2396 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2397 if (bs->drv->bdrv_co_flush_to_os) { 2398 ret = bs->drv->bdrv_co_flush_to_os(bs); 2399 if (ret < 0) { 2400 goto out; 2401 } 2402 } 2403 2404 /* But don't actually force it to the disk with cache=unsafe */ 2405 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2406 goto flush_parent; 2407 } 2408 2409 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2410 if (bs->drv->bdrv_co_flush_to_disk) { 2411 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2412 } else if (bs->drv->bdrv_aio_flush) { 2413 BlockAIOCB *acb; 2414 CoroutineIOCompletion co = { 2415 .coroutine = qemu_coroutine_self(), 2416 }; 2417 2418 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2419 if (acb == NULL) { 2420 ret = -EIO; 2421 } else { 2422 qemu_coroutine_yield(); 2423 ret = co.ret; 2424 } 2425 } else { 2426 /* 2427 * Some block drivers always operate in either writethrough or unsafe 2428 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2429 * know how the server works (because the behaviour is hardcoded or 2430 * depends on server-side configuration), so we can't ensure that 2431 * everything is safe on disk. Returning an error doesn't work because 2432 * that would break guests even if the server operates in writethrough 2433 * mode. 
2434 * 2435 * Let's hope the user knows what he's doing. 2436 */ 2437 ret = 0; 2438 } 2439 if (ret < 0) { 2440 goto out; 2441 } 2442 2443 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2444 * in the case of cache=unsafe, so there are no useless flushes. 2445 */ 2446 flush_parent: 2447 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; 2448 out: 2449 tracked_request_end(&req); 2450 return ret; 2451 } 2452 2453 int bdrv_flush(BlockDriverState *bs) 2454 { 2455 Coroutine *co; 2456 RwCo rwco = { 2457 .bs = bs, 2458 .ret = NOT_DONE, 2459 }; 2460 2461 if (qemu_in_coroutine()) { 2462 /* Fast-path if already in coroutine context */ 2463 bdrv_flush_co_entry(&rwco); 2464 } else { 2465 AioContext *aio_context = bdrv_get_aio_context(bs); 2466 2467 co = qemu_coroutine_create(bdrv_flush_co_entry); 2468 qemu_coroutine_enter(co, &rwco); 2469 while (rwco.ret == NOT_DONE) { 2470 aio_poll(aio_context, true); 2471 } 2472 } 2473 2474 return rwco.ret; 2475 } 2476 2477 typedef struct DiscardCo { 2478 BlockDriverState *bs; 2479 int64_t sector_num; 2480 int nb_sectors; 2481 int ret; 2482 } DiscardCo; 2483 static void coroutine_fn bdrv_discard_co_entry(void *opaque) 2484 { 2485 DiscardCo *rwco = opaque; 2486 2487 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); 2488 } 2489 2490 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, 2491 int nb_sectors) 2492 { 2493 BdrvTrackedRequest req; 2494 int max_discard, ret; 2495 2496 if (!bs->drv) { 2497 return -ENOMEDIUM; 2498 } 2499 2500 ret = bdrv_check_request(bs, sector_num, nb_sectors); 2501 if (ret < 0) { 2502 return ret; 2503 } else if (bs->read_only) { 2504 return -EPERM; 2505 } 2506 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 2507 2508 /* Do nothing if disabled. 
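 * Discard is only advisory, so returning success without touching the
 * driver is fine both here and when the driver implements neither discard
 * interface below.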
*/ 2509 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2510 return 0; 2511 } 2512 2513 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { 2514 return 0; 2515 } 2516 2517 tracked_request_begin(&req, bs, sector_num, nb_sectors, 2518 BDRV_TRACKED_DISCARD); 2519 bdrv_set_dirty(bs, sector_num, nb_sectors); 2520 2521 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); 2522 while (nb_sectors > 0) { 2523 int ret; 2524 int num = nb_sectors; 2525 2526 /* align request */ 2527 if (bs->bl.discard_alignment && 2528 num >= bs->bl.discard_alignment && 2529 sector_num % bs->bl.discard_alignment) { 2530 if (num > bs->bl.discard_alignment) { 2531 num = bs->bl.discard_alignment; 2532 } 2533 num -= sector_num % bs->bl.discard_alignment; 2534 } 2535 2536 /* limit request size */ 2537 if (num > max_discard) { 2538 num = max_discard; 2539 } 2540 2541 if (bs->drv->bdrv_co_discard) { 2542 ret = bs->drv->bdrv_co_discard(bs, sector_num, num); 2543 } else { 2544 BlockAIOCB *acb; 2545 CoroutineIOCompletion co = { 2546 .coroutine = qemu_coroutine_self(), 2547 }; 2548 2549 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, 2550 bdrv_co_io_em_complete, &co); 2551 if (acb == NULL) { 2552 ret = -EIO; 2553 goto out; 2554 } else { 2555 qemu_coroutine_yield(); 2556 ret = co.ret; 2557 } 2558 } 2559 if (ret && ret != -ENOTSUP) { 2560 goto out; 2561 } 2562 2563 sector_num += num; 2564 nb_sectors -= num; 2565 } 2566 ret = 0; 2567 out: 2568 tracked_request_end(&req); 2569 return ret; 2570 } 2571 2572 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) 2573 { 2574 Coroutine *co; 2575 DiscardCo rwco = { 2576 .bs = bs, 2577 .sector_num = sector_num, 2578 .nb_sectors = nb_sectors, 2579 .ret = NOT_DONE, 2580 }; 2581 2582 if (qemu_in_coroutine()) { 2583 /* Fast-path if already in coroutine context */ 2584 bdrv_discard_co_entry(&rwco); 2585 } else { 2586 AioContext *aio_context = bdrv_get_aio_context(bs); 2587 2588 co = qemu_coroutine_create(bdrv_discard_co_entry); 2589 qemu_coroutine_enter(co, &rwco); 2590 while (rwco.ret == NOT_DONE) { 2591 aio_poll(aio_context, true); 2592 } 2593 } 2594 2595 return rwco.ret; 2596 } 2597 2598 typedef struct { 2599 CoroutineIOCompletion *co; 2600 QEMUBH *bh; 2601 } BdrvIoctlCompletionData; 2602 2603 static void bdrv_ioctl_bh_cb(void *opaque) 2604 { 2605 BdrvIoctlCompletionData *data = opaque; 2606 2607 bdrv_co_io_em_complete(data->co, -ENOTSUP); 2608 qemu_bh_delete(data->bh); 2609 } 2610 2611 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) 2612 { 2613 BlockDriver *drv = bs->drv; 2614 BdrvTrackedRequest tracked_req; 2615 CoroutineIOCompletion co = { 2616 .coroutine = qemu_coroutine_self(), 2617 }; 2618 BlockAIOCB *acb; 2619 2620 tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); 2621 if (!drv || !drv->bdrv_aio_ioctl) { 2622 co.ret = -ENOTSUP; 2623 goto out; 2624 } 2625 2626 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 2627 if (!acb) { 2628 BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); 2629 data->bh = aio_bh_new(bdrv_get_aio_context(bs), 2630 bdrv_ioctl_bh_cb, data); 2631 data->co = &co; 2632 qemu_bh_schedule(data->bh); 2633 } 2634 qemu_coroutine_yield(); 2635 out: 2636 tracked_request_end(&tracked_req); 2637 return co.ret; 2638 } 2639 2640 typedef struct { 2641 BlockDriverState *bs; 2642 int req; 2643 void *buf; 2644 int ret; 2645 } BdrvIoctlCoData; 2646 2647 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) 2648 { 2649 BdrvIoctlCoData *data = opaque; 
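/* data->ret was initialised to -EINPROGRESS by bdrv_ioctl(); storing the
 * real return value here is what terminates its aio_poll() loop when the
 * ioctl was issued outside coroutine context. */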
2650 data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); 2651 } 2652 2653 /* needed for generic scsi interface */ 2654 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 2655 { 2656 BdrvIoctlCoData data = { 2657 .bs = bs, 2658 .req = req, 2659 .buf = buf, 2660 .ret = -EINPROGRESS, 2661 }; 2662 2663 if (qemu_in_coroutine()) { 2664 /* Fast-path if already in coroutine context */ 2665 bdrv_co_ioctl_entry(&data); 2666 } else { 2667 Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); 2668 2669 qemu_coroutine_enter(co, &data); 2670 while (data.ret == -EINPROGRESS) { 2671 aio_poll(bdrv_get_aio_context(bs), true); 2672 } 2673 } 2674 return data.ret; 2675 } 2676 2677 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) 2678 { 2679 BlockAIOCBCoroutine *acb = opaque; 2680 acb->req.error = bdrv_co_do_ioctl(acb->common.bs, 2681 acb->req.req, acb->req.buf); 2682 bdrv_co_complete(acb); 2683 } 2684 2685 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, 2686 unsigned long int req, void *buf, 2687 BlockCompletionFunc *cb, void *opaque) 2688 { 2689 BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, 2690 bs, cb, opaque); 2691 Coroutine *co; 2692 2693 acb->need_bh = true; 2694 acb->req.error = -EINPROGRESS; 2695 acb->req.req = req; 2696 acb->req.buf = buf; 2697 co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); 2698 qemu_coroutine_enter(co, acb); 2699 2700 bdrv_co_maybe_schedule_bh(acb); 2701 return &acb->common; 2702 } 2703 2704 void *qemu_blockalign(BlockDriverState *bs, size_t size) 2705 { 2706 return qemu_memalign(bdrv_opt_mem_align(bs), size); 2707 } 2708 2709 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 2710 { 2711 return memset(qemu_blockalign(bs, size), 0, size); 2712 } 2713 2714 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 2715 { 2716 size_t align = bdrv_opt_mem_align(bs); 2717 2718 /* Ensure that NULL is never returned on success */ 2719 assert(align > 0); 2720 if (size == 0) { 2721 size = align; 2722 } 2723 2724 return qemu_try_memalign(align, size); 2725 } 2726 2727 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 2728 { 2729 void *mem = qemu_try_blockalign(bs, size); 2730 2731 if (mem) { 2732 memset(mem, 0, size); 2733 } 2734 2735 return mem; 2736 } 2737 2738 /* 2739 * Check if all memory in this vector is sector aligned. 
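 * (Alignment here means the driver's minimum memory alignment, as returned
 * by bdrv_min_mem_align(): the check is about the buffer addresses and
 * lengths of each iovec entry, not about the request's sector offsets.)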
2740 */ 2741 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 2742 { 2743 int i; 2744 size_t alignment = bdrv_min_mem_align(bs); 2745 2746 for (i = 0; i < qiov->niov; i++) { 2747 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 2748 return false; 2749 } 2750 if (qiov->iov[i].iov_len % alignment) { 2751 return false; 2752 } 2753 } 2754 2755 return true; 2756 } 2757 2758 void bdrv_add_before_write_notifier(BlockDriverState *bs, 2759 NotifierWithReturn *notifier) 2760 { 2761 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 2762 } 2763 2764 void bdrv_io_plug(BlockDriverState *bs) 2765 { 2766 BlockDriver *drv = bs->drv; 2767 if (drv && drv->bdrv_io_plug) { 2768 drv->bdrv_io_plug(bs); 2769 } else if (bs->file) { 2770 bdrv_io_plug(bs->file->bs); 2771 } 2772 } 2773 2774 void bdrv_io_unplug(BlockDriverState *bs) 2775 { 2776 BlockDriver *drv = bs->drv; 2777 if (drv && drv->bdrv_io_unplug) { 2778 drv->bdrv_io_unplug(bs); 2779 } else if (bs->file) { 2780 bdrv_io_unplug(bs->file->bs); 2781 } 2782 } 2783 2784 void bdrv_flush_io_queue(BlockDriverState *bs) 2785 { 2786 BlockDriver *drv = bs->drv; 2787 if (drv && drv->bdrv_flush_io_queue) { 2788 drv->bdrv_flush_io_queue(bs); 2789 } else if (bs->file) { 2790 bdrv_flush_io_queue(bs->file->bs); 2791 } 2792 bdrv_start_throttled_reqs(bs); 2793 } 2794 2795 void bdrv_drained_begin(BlockDriverState *bs) 2796 { 2797 if (!bs->quiesce_counter++) { 2798 aio_disable_external(bdrv_get_aio_context(bs)); 2799 } 2800 bdrv_drain(bs); 2801 } 2802 2803 void bdrv_drained_end(BlockDriverState *bs) 2804 { 2805 assert(bs->quiesce_counter > 0); 2806 if (--bs->quiesce_counter > 0) { 2807 return; 2808 } 2809 aio_enable_external(bdrv_get_aio_context(bs)); 2810 } 2811