1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "trace.h" 27 #include "sysemu/block-backend.h" 28 #include "block/blockjob.h" 29 #include "block/block_int.h" 30 #include "block/throttle-groups.h" 31 #include "qemu/error-report.h" 32 33 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 34 35 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 36 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 37 BlockCompletionFunc *cb, void *opaque); 38 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 39 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 40 BlockCompletionFunc *cb, void *opaque); 41 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 42 int64_t sector_num, int nb_sectors, 43 QEMUIOVector *iov); 44 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 45 int64_t sector_num, int nb_sectors, 46 QEMUIOVector *iov); 47 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 48 int64_t sector_num, 49 QEMUIOVector *qiov, 50 int nb_sectors, 51 BdrvRequestFlags flags, 52 BlockCompletionFunc *cb, 53 void *opaque, 54 bool is_write); 55 static void coroutine_fn bdrv_co_do_rw(void *opaque); 56 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 57 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); 58 59 /* throttling disk I/O limits */ 60 void bdrv_set_io_limits(BlockDriverState *bs, 61 ThrottleConfig *cfg) 62 { 63 int i; 64 65 throttle_group_config(bs, cfg); 66 67 for (i = 0; i < 2; i++) { 68 qemu_co_enter_next(&bs->throttled_reqs[i]); 69 } 70 } 71 72 /* this function drain all the throttled IOs */ 73 static bool bdrv_start_throttled_reqs(BlockDriverState *bs) 74 { 75 bool drained = false; 76 bool enabled = bs->io_limits_enabled; 77 int i; 78 79 bs->io_limits_enabled = false; 80 81 for (i = 0; i < 2; i++) { 82 while (qemu_co_enter_next(&bs->throttled_reqs[i])) { 83 drained = true; 84 } 85 } 86 87 bs->io_limits_enabled = enabled; 88 89 return drained; 90 } 91 92 void bdrv_io_limits_disable(BlockDriverState *bs) 93 { 94 bs->io_limits_enabled = false; 95 bdrv_start_throttled_reqs(bs); 96 throttle_group_unregister_bs(bs); 97 } 98 99 /* should be called before bdrv_set_io_limits if a limit is set */ 100 void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) 101 { 102 assert(!bs->io_limits_enabled); 103 
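    /* Join the named throttle group before flagging throttling as enabled */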
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is a part of the same group as the one we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
        bs->bl.max_iov = bs->file->bs->bl.max_iov;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
        bs->bl.max_iov =
            MIN(bs->bl.max_iov,
                bs->backing->bs->bl.max_iov);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
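 * Each bdrv_enable_copy_on_read() call must therefore be balanced by a later
 * bdrv_disable_copy_on_read() call.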
206 */ 207 void bdrv_enable_copy_on_read(BlockDriverState *bs) 208 { 209 bs->copy_on_read++; 210 } 211 212 void bdrv_disable_copy_on_read(BlockDriverState *bs) 213 { 214 assert(bs->copy_on_read > 0); 215 bs->copy_on_read--; 216 } 217 218 /* Check if any requests are in-flight (including throttled requests) */ 219 bool bdrv_requests_pending(BlockDriverState *bs) 220 { 221 BdrvChild *child; 222 223 if (!QLIST_EMPTY(&bs->tracked_requests)) { 224 return true; 225 } 226 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 227 return true; 228 } 229 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 230 return true; 231 } 232 233 QLIST_FOREACH(child, &bs->children, next) { 234 if (bdrv_requests_pending(child->bs)) { 235 return true; 236 } 237 } 238 239 return false; 240 } 241 242 static void bdrv_drain_recurse(BlockDriverState *bs) 243 { 244 BdrvChild *child; 245 246 if (bs->drv && bs->drv->bdrv_drain) { 247 bs->drv->bdrv_drain(bs); 248 } 249 QLIST_FOREACH(child, &bs->children, next) { 250 bdrv_drain_recurse(child->bs); 251 } 252 } 253 254 /* 255 * Wait for pending requests to complete on a single BlockDriverState subtree, 256 * and suspend block driver's internal I/O until next request arrives. 257 * 258 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 259 * AioContext. 260 * 261 * Only this BlockDriverState's AioContext is run, so in-flight requests must 262 * not depend on events in other AioContexts. In that case, use 263 * bdrv_drain_all() instead. 264 */ 265 void bdrv_drain(BlockDriverState *bs) 266 { 267 bool busy = true; 268 269 bdrv_drain_recurse(bs); 270 while (busy) { 271 /* Keep iterating */ 272 bdrv_flush_io_queue(bs); 273 busy = bdrv_requests_pending(bs); 274 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 275 } 276 } 277 278 /* 279 * Wait for pending requests to complete across all BlockDriverStates 280 * 281 * This function does not flush data to disk, use bdrv_flush_all() for that 282 * after calling this function. 283 */ 284 void bdrv_drain_all(void) 285 { 286 /* Always run first iteration so any pending completion BHs run */ 287 bool busy = true; 288 BlockDriverState *bs = NULL; 289 GSList *aio_ctxs = NULL, *ctx; 290 291 while ((bs = bdrv_next(bs))) { 292 AioContext *aio_context = bdrv_get_aio_context(bs); 293 294 aio_context_acquire(aio_context); 295 if (bs->job) { 296 block_job_pause(bs->job); 297 } 298 bdrv_drain_recurse(bs); 299 aio_context_release(aio_context); 300 301 if (!g_slist_find(aio_ctxs, aio_context)) { 302 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 303 } 304 } 305 306 /* Note that completion of an asynchronous I/O operation can trigger any 307 * number of other I/O operations on other devices---for example a 308 * coroutine can submit an I/O request to another device in response to 309 * request completion. Therefore we must keep looping until there was no 310 * more activity rather than simply draining each device independently. 
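 * The loop below therefore keeps polling every AioContext until none of them
 * reports any further activity.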
311 */ 312 while (busy) { 313 busy = false; 314 315 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 316 AioContext *aio_context = ctx->data; 317 bs = NULL; 318 319 aio_context_acquire(aio_context); 320 while ((bs = bdrv_next(bs))) { 321 if (aio_context == bdrv_get_aio_context(bs)) { 322 bdrv_flush_io_queue(bs); 323 if (bdrv_requests_pending(bs)) { 324 busy = true; 325 aio_poll(aio_context, busy); 326 } 327 } 328 } 329 busy |= aio_poll(aio_context, false); 330 aio_context_release(aio_context); 331 } 332 } 333 334 bs = NULL; 335 while ((bs = bdrv_next(bs))) { 336 AioContext *aio_context = bdrv_get_aio_context(bs); 337 338 aio_context_acquire(aio_context); 339 if (bs->job) { 340 block_job_resume(bs->job); 341 } 342 aio_context_release(aio_context); 343 } 344 g_slist_free(aio_ctxs); 345 } 346 347 /** 348 * Remove an active request from the tracked requests list 349 * 350 * This function should be called when a tracked request is completing. 351 */ 352 static void tracked_request_end(BdrvTrackedRequest *req) 353 { 354 if (req->serialising) { 355 req->bs->serialising_in_flight--; 356 } 357 358 QLIST_REMOVE(req, list); 359 qemu_co_queue_restart_all(&req->wait_queue); 360 } 361 362 /** 363 * Add an active request to the tracked requests list 364 */ 365 static void tracked_request_begin(BdrvTrackedRequest *req, 366 BlockDriverState *bs, 367 int64_t offset, 368 unsigned int bytes, 369 enum BdrvTrackedRequestType type) 370 { 371 *req = (BdrvTrackedRequest){ 372 .bs = bs, 373 .offset = offset, 374 .bytes = bytes, 375 .type = type, 376 .co = qemu_coroutine_self(), 377 .serialising = false, 378 .overlap_offset = offset, 379 .overlap_bytes = bytes, 380 }; 381 382 qemu_co_queue_init(&req->wait_queue); 383 384 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 385 } 386 387 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 388 { 389 int64_t overlap_offset = req->offset & ~(align - 1); 390 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 391 - overlap_offset; 392 393 if (!req->serialising) { 394 req->bs->serialising_in_flight++; 395 req->serialising = true; 396 } 397 398 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 399 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 400 } 401 402 /** 403 * Round a region to cluster boundaries 404 */ 405 void bdrv_round_to_clusters(BlockDriverState *bs, 406 int64_t sector_num, int nb_sectors, 407 int64_t *cluster_sector_num, 408 int *cluster_nb_sectors) 409 { 410 BlockDriverInfo bdi; 411 412 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 413 *cluster_sector_num = sector_num; 414 *cluster_nb_sectors = nb_sectors; 415 } else { 416 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 417 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 418 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 419 nb_sectors, c); 420 } 421 } 422 423 static int bdrv_get_cluster_size(BlockDriverState *bs) 424 { 425 BlockDriverInfo bdi; 426 int ret; 427 428 ret = bdrv_get_info(bs, &bdi); 429 if (ret < 0 || bdi.cluster_size == 0) { 430 return bs->request_alignment; 431 } else { 432 return bdi.cluster_size; 433 } 434 } 435 436 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 437 int64_t offset, unsigned int bytes) 438 { 439 /* aaaa bbbb */ 440 if (offset >= req->overlap_offset + req->overlap_bytes) { 441 return false; 442 } 443 /* bbbb aaaa */ 444 if (req->overlap_offset >= offset + bytes) { 445 return false; 446 } 447 return true; 448 } 449 450 static bool coroutine_fn 
wait_serialising_requests(BdrvTrackedRequest *self) 451 { 452 BlockDriverState *bs = self->bs; 453 BdrvTrackedRequest *req; 454 bool retry; 455 bool waited = false; 456 457 if (!bs->serialising_in_flight) { 458 return false; 459 } 460 461 do { 462 retry = false; 463 QLIST_FOREACH(req, &bs->tracked_requests, list) { 464 if (req == self || (!req->serialising && !self->serialising)) { 465 continue; 466 } 467 if (tracked_request_overlaps(req, self->overlap_offset, 468 self->overlap_bytes)) 469 { 470 /* Hitting this means there was a reentrant request, for 471 * example, a block driver issuing nested requests. This must 472 * never happen since it means deadlock. 473 */ 474 assert(qemu_coroutine_self() != req->co); 475 476 /* If the request is already (indirectly) waiting for us, or 477 * will wait for us as soon as it wakes up, then just go on 478 * (instead of producing a deadlock in the former case). */ 479 if (!req->waiting_for) { 480 self->waiting_for = req; 481 qemu_co_queue_wait(&req->wait_queue); 482 self->waiting_for = NULL; 483 retry = true; 484 waited = true; 485 break; 486 } 487 } 488 } 489 } while (retry); 490 491 return waited; 492 } 493 494 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 495 size_t size) 496 { 497 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 498 return -EIO; 499 } 500 501 if (!bdrv_is_inserted(bs)) { 502 return -ENOMEDIUM; 503 } 504 505 if (offset < 0) { 506 return -EIO; 507 } 508 509 return 0; 510 } 511 512 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 513 int nb_sectors) 514 { 515 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 516 return -EIO; 517 } 518 519 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 520 nb_sectors * BDRV_SECTOR_SIZE); 521 } 522 523 typedef struct RwCo { 524 BlockDriverState *bs; 525 int64_t offset; 526 QEMUIOVector *qiov; 527 bool is_write; 528 int ret; 529 BdrvRequestFlags flags; 530 } RwCo; 531 532 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 533 { 534 RwCo *rwco = opaque; 535 536 if (!rwco->is_write) { 537 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 538 rwco->qiov->size, rwco->qiov, 539 rwco->flags); 540 } else { 541 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 542 rwco->qiov->size, rwco->qiov, 543 rwco->flags); 544 } 545 } 546 547 /* 548 * Process a vectored synchronous request using coroutines 549 */ 550 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 551 QEMUIOVector *qiov, bool is_write, 552 BdrvRequestFlags flags) 553 { 554 Coroutine *co; 555 RwCo rwco = { 556 .bs = bs, 557 .offset = offset, 558 .qiov = qiov, 559 .is_write = is_write, 560 .ret = NOT_DONE, 561 .flags = flags, 562 }; 563 564 /** 565 * In sync call context, when the vcpu is blocked, this throttling timer 566 * will not fire; so the I/O throttling function has to be disabled here 567 * if it has been enabled. 
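 * (bdrv_io_limits_disable() also restarts any requests that are already
 * waiting in the throttled_reqs queues)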
568 */ 569 if (bs->io_limits_enabled) { 570 fprintf(stderr, "Disabling I/O throttling on '%s' due " 571 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 572 bdrv_io_limits_disable(bs); 573 } 574 575 if (qemu_in_coroutine()) { 576 /* Fast-path if already in coroutine context */ 577 bdrv_rw_co_entry(&rwco); 578 } else { 579 AioContext *aio_context = bdrv_get_aio_context(bs); 580 581 co = qemu_coroutine_create(bdrv_rw_co_entry); 582 qemu_coroutine_enter(co, &rwco); 583 while (rwco.ret == NOT_DONE) { 584 aio_poll(aio_context, true); 585 } 586 } 587 return rwco.ret; 588 } 589 590 /* 591 * Process a synchronous request using coroutines 592 */ 593 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 594 int nb_sectors, bool is_write, BdrvRequestFlags flags) 595 { 596 QEMUIOVector qiov; 597 struct iovec iov = { 598 .iov_base = (void *)buf, 599 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 600 }; 601 602 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 603 return -EINVAL; 604 } 605 606 qemu_iovec_init_external(&qiov, &iov, 1); 607 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 608 &qiov, is_write, flags); 609 } 610 611 /* return < 0 if error. See bdrv_write() for the return codes */ 612 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 613 uint8_t *buf, int nb_sectors) 614 { 615 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 616 } 617 618 /* Return < 0 if error. Important errors are: 619 -EIO generic I/O error (may happen for all errors) 620 -ENOMEDIUM No media inserted. 621 -EINVAL Invalid sector number or nb_sectors 622 -EACCES Trying to write a read-only device 623 */ 624 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 625 const uint8_t *buf, int nb_sectors) 626 { 627 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 628 } 629 630 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 631 int nb_sectors, BdrvRequestFlags flags) 632 { 633 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 634 BDRV_REQ_ZERO_WRITE | flags); 635 } 636 637 /* 638 * Completely zero out a block device with the help of bdrv_write_zeroes. 639 * The operation is sped up by checking the block status and only writing 640 * zeroes to the device if they currently do not return zeroes. Optional 641 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 642 * 643 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
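 *
 * Illustrative usage (a sketch, not taken from an existing caller): to zero a
 * whole device and let the driver discard blocks where possible, one might do
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         error_report("zeroing %s failed: %s", bdrv_get_device_name(bs),
 *                      strerror(-ret));
 *     }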
644 */ 645 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 646 { 647 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 648 BlockDriverState *file; 649 int n; 650 651 target_sectors = bdrv_nb_sectors(bs); 652 if (target_sectors < 0) { 653 return target_sectors; 654 } 655 656 for (;;) { 657 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 658 if (nb_sectors <= 0) { 659 return 0; 660 } 661 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); 662 if (ret < 0) { 663 error_report("error getting block status at sector %" PRId64 ": %s", 664 sector_num, strerror(-ret)); 665 return ret; 666 } 667 if (ret & BDRV_BLOCK_ZERO) { 668 sector_num += n; 669 continue; 670 } 671 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 672 if (ret < 0) { 673 error_report("error writing zeroes at sector %" PRId64 ": %s", 674 sector_num, strerror(-ret)); 675 return ret; 676 } 677 sector_num += n; 678 } 679 } 680 681 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 682 { 683 QEMUIOVector qiov; 684 struct iovec iov = { 685 .iov_base = (void *)buf, 686 .iov_len = bytes, 687 }; 688 int ret; 689 690 if (bytes < 0) { 691 return -EINVAL; 692 } 693 694 qemu_iovec_init_external(&qiov, &iov, 1); 695 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 696 if (ret < 0) { 697 return ret; 698 } 699 700 return bytes; 701 } 702 703 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 704 { 705 int ret; 706 707 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 708 if (ret < 0) { 709 return ret; 710 } 711 712 return qiov->size; 713 } 714 715 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 716 const void *buf, int bytes) 717 { 718 QEMUIOVector qiov; 719 struct iovec iov = { 720 .iov_base = (void *) buf, 721 .iov_len = bytes, 722 }; 723 724 if (bytes < 0) { 725 return -EINVAL; 726 } 727 728 qemu_iovec_init_external(&qiov, &iov, 1); 729 return bdrv_pwritev(bs, offset, &qiov); 730 } 731 732 /* 733 * Writes to the file and ensures that no writes are reordered across this 734 * request (acts as a barrier) 735 * 736 * Returns 0 on success, -errno in error cases. 737 */ 738 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 739 const void *buf, int count) 740 { 741 int ret; 742 743 ret = bdrv_pwrite(bs, offset, buf, count); 744 if (ret < 0) { 745 return ret; 746 } 747 748 /* No flush needed for cache modes that already do it */ 749 if (bs->enable_write_cache) { 750 bdrv_flush(bs); 751 } 752 753 return 0; 754 } 755 756 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 757 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 758 { 759 /* Perform I/O through a temporary buffer so that users who scribble over 760 * their read buffer while the operation is in progress do not end up 761 * modifying the image file. This is critical for zero-copy guest I/O 762 * where anything might happen inside guest memory. 763 */ 764 void *bounce_buffer; 765 766 BlockDriver *drv = bs->drv; 767 struct iovec iov; 768 QEMUIOVector bounce_qiov; 769 int64_t cluster_sector_num; 770 int cluster_nb_sectors; 771 size_t skip_bytes; 772 int ret; 773 774 /* Cover entire cluster so no additional backing file I/O is required when 775 * allocating cluster in the image file. 
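 * (this is why bdrv_round_to_clusters() below widens the request to cluster
 * boundaries before the read)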
776 */ 777 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 778 &cluster_sector_num, &cluster_nb_sectors); 779 780 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 781 cluster_sector_num, cluster_nb_sectors); 782 783 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 784 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 785 if (bounce_buffer == NULL) { 786 ret = -ENOMEM; 787 goto err; 788 } 789 790 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 791 792 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 793 &bounce_qiov); 794 if (ret < 0) { 795 goto err; 796 } 797 798 if (drv->bdrv_co_write_zeroes && 799 buffer_is_zero(bounce_buffer, iov.iov_len)) { 800 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 801 cluster_nb_sectors, 0); 802 } else { 803 /* This does not change the data on the disk, it is not necessary 804 * to flush even in cache=writethrough mode. 805 */ 806 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 807 &bounce_qiov); 808 } 809 810 if (ret < 0) { 811 /* It might be okay to ignore write errors for guest requests. If this 812 * is a deliberate copy-on-read then we don't want to ignore the error. 813 * Simply report it in all cases. 814 */ 815 goto err; 816 } 817 818 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 819 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 820 nb_sectors * BDRV_SECTOR_SIZE); 821 822 err: 823 qemu_vfree(bounce_buffer); 824 return ret; 825 } 826 827 /* 828 * Forwards an already correctly aligned request to the BlockDriver. This 829 * handles copy on read and zeroing after EOF; any other features must be 830 * implemented by the caller. 831 */ 832 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 833 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 834 int64_t align, QEMUIOVector *qiov, int flags) 835 { 836 BlockDriver *drv = bs->drv; 837 int ret; 838 839 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 840 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 841 842 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 843 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 844 assert(!qiov || bytes == qiov->size); 845 846 /* Handle Copy on Read and associated serialisation */ 847 if (flags & BDRV_REQ_COPY_ON_READ) { 848 /* If we touch the same cluster it counts as an overlap. This 849 * guarantees that allocating writes will be serialized and not race 850 * with each other for the same cluster. For example, in copy-on-read 851 * it ensures that the CoR read and write operations are atomic and 852 * guest writes cannot interleave between them. 
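 * mark_request_serialising() below widens the tracked request's overlap
 * range to cluster granularity to enforce this.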
*/ 853 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 854 } 855 856 if (!(flags & BDRV_REQ_NO_SERIALISING)) { 857 wait_serialising_requests(req); 858 } 859 860 if (flags & BDRV_REQ_COPY_ON_READ) { 861 int pnum; 862 863 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 864 if (ret < 0) { 865 goto out; 866 } 867 868 if (!ret || pnum != nb_sectors) { 869 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 870 goto out; 871 } 872 } 873 874 /* Forward the request to the BlockDriver */ 875 if (!bs->zero_beyond_eof) { 876 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 877 } else { 878 /* Read zeros after EOF */ 879 int64_t total_sectors, max_nb_sectors; 880 881 total_sectors = bdrv_nb_sectors(bs); 882 if (total_sectors < 0) { 883 ret = total_sectors; 884 goto out; 885 } 886 887 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 888 align >> BDRV_SECTOR_BITS); 889 if (nb_sectors < max_nb_sectors) { 890 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 891 } else if (max_nb_sectors > 0) { 892 QEMUIOVector local_qiov; 893 894 qemu_iovec_init(&local_qiov, qiov->niov); 895 qemu_iovec_concat(&local_qiov, qiov, 0, 896 max_nb_sectors * BDRV_SECTOR_SIZE); 897 898 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 899 &local_qiov); 900 901 qemu_iovec_destroy(&local_qiov); 902 } else { 903 ret = 0; 904 } 905 906 /* Reading beyond end of file is supposed to produce zeroes */ 907 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 908 uint64_t offset = MAX(0, total_sectors - sector_num); 909 uint64_t bytes = (sector_num + nb_sectors - offset) * 910 BDRV_SECTOR_SIZE; 911 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 912 } 913 } 914 915 out: 916 return ret; 917 } 918 919 /* 920 * Handle a read request in coroutine context 921 */ 922 int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 923 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 924 BdrvRequestFlags flags) 925 { 926 BlockDriver *drv = bs->drv; 927 BdrvTrackedRequest req; 928 929 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 930 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 931 uint8_t *head_buf = NULL; 932 uint8_t *tail_buf = NULL; 933 QEMUIOVector local_qiov; 934 bool use_local_qiov = false; 935 int ret; 936 937 if (!drv) { 938 return -ENOMEDIUM; 939 } 940 941 ret = bdrv_check_byte_request(bs, offset, bytes); 942 if (ret < 0) { 943 return ret; 944 } 945 946 /* Don't do copy-on-read if we read data before write operation */ 947 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { 948 flags |= BDRV_REQ_COPY_ON_READ; 949 } 950 951 /* throttling disk I/O */ 952 if (bs->io_limits_enabled) { 953 throttle_group_co_io_limits_intercept(bs, bytes, false); 954 } 955 956 /* Align read if necessary by padding qiov */ 957 if (offset & (align - 1)) { 958 head_buf = qemu_blockalign(bs, align); 959 qemu_iovec_init(&local_qiov, qiov->niov + 2); 960 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 961 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 962 use_local_qiov = true; 963 964 bytes += offset & (align - 1); 965 offset = offset & ~(align - 1); 966 } 967 968 if ((offset + bytes) & (align - 1)) { 969 if (!use_local_qiov) { 970 qemu_iovec_init(&local_qiov, qiov->niov + 1); 971 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 972 use_local_qiov = true; 973 } 974 tail_buf = qemu_blockalign(bs, align); 975 qemu_iovec_add(&local_qiov, tail_buf, 976 align - ((offset + bytes) & (align - 1))); 977 
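        /* Grow the request so that the padded read ends on an alignment
         * boundary */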
978 bytes = ROUND_UP(bytes, align); 979 } 980 981 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 982 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 983 use_local_qiov ? &local_qiov : qiov, 984 flags); 985 tracked_request_end(&req); 986 987 if (use_local_qiov) { 988 qemu_iovec_destroy(&local_qiov); 989 qemu_vfree(head_buf); 990 qemu_vfree(tail_buf); 991 } 992 993 return ret; 994 } 995 996 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 997 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 998 BdrvRequestFlags flags) 999 { 1000 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1001 return -EINVAL; 1002 } 1003 1004 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 1005 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1006 } 1007 1008 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 1009 int nb_sectors, QEMUIOVector *qiov) 1010 { 1011 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1012 1013 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1014 } 1015 1016 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, 1017 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1018 { 1019 trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); 1020 1021 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1022 BDRV_REQ_NO_SERIALISING); 1023 } 1024 1025 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1026 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1027 { 1028 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1029 1030 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1031 BDRV_REQ_COPY_ON_READ); 1032 } 1033 1034 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1035 1036 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1037 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1038 { 1039 BlockDriver *drv = bs->drv; 1040 QEMUIOVector qiov; 1041 struct iovec iov = {0}; 1042 int ret = 0; 1043 1044 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1045 BDRV_REQUEST_MAX_SECTORS); 1046 1047 while (nb_sectors > 0 && !ret) { 1048 int num = nb_sectors; 1049 1050 /* Align request. Block drivers can expect the "bulk" of the request 1051 * to be aligned. 1052 */ 1053 if (bs->bl.write_zeroes_alignment 1054 && num > bs->bl.write_zeroes_alignment) { 1055 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1056 /* Make a small request up to the first aligned sector. */ 1057 num = bs->bl.write_zeroes_alignment; 1058 num -= sector_num % bs->bl.write_zeroes_alignment; 1059 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1060 /* Shorten the request to the last aligned sector. num cannot 1061 * underflow because num > bs->bl.write_zeroes_alignment. 
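 * After this adjustment both the start and the end of this iteration's
 * request are multiples of write_zeroes_alignment.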
1062 */ 1063 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 1064 } 1065 } 1066 1067 /* limit request size */ 1068 if (num > max_write_zeroes) { 1069 num = max_write_zeroes; 1070 } 1071 1072 ret = -ENOTSUP; 1073 /* First try the efficient write zeroes operation */ 1074 if (drv->bdrv_co_write_zeroes) { 1075 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 1076 } 1077 1078 if (ret == -ENOTSUP) { 1079 /* Fall back to bounce buffer if write zeroes is unsupported */ 1080 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, 1081 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1082 num = MIN(num, max_xfer_len); 1083 iov.iov_len = num * BDRV_SECTOR_SIZE; 1084 if (iov.iov_base == NULL) { 1085 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); 1086 if (iov.iov_base == NULL) { 1087 ret = -ENOMEM; 1088 goto fail; 1089 } 1090 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 1091 } 1092 qemu_iovec_init_external(&qiov, &iov, 1); 1093 1094 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 1095 1096 /* Keep bounce buffer around if it is big enough for all 1097 * all future requests. 1098 */ 1099 if (num < max_xfer_len) { 1100 qemu_vfree(iov.iov_base); 1101 iov.iov_base = NULL; 1102 } 1103 } 1104 1105 sector_num += num; 1106 nb_sectors -= num; 1107 } 1108 1109 fail: 1110 qemu_vfree(iov.iov_base); 1111 return ret; 1112 } 1113 1114 /* 1115 * Forwards an already correctly aligned write request to the BlockDriver. 1116 */ 1117 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 1118 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1119 QEMUIOVector *qiov, int flags) 1120 { 1121 BlockDriver *drv = bs->drv; 1122 bool waited; 1123 int ret; 1124 1125 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 1126 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 1127 1128 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1129 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1130 assert(!qiov || bytes == qiov->size); 1131 1132 waited = wait_serialising_requests(req); 1133 assert(!waited || !req->serialising); 1134 assert(req->overlap_offset <= offset); 1135 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1136 1137 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1138 1139 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1140 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 1141 qemu_iovec_is_zero(qiov)) { 1142 flags |= BDRV_REQ_ZERO_WRITE; 1143 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1144 flags |= BDRV_REQ_MAY_UNMAP; 1145 } 1146 } 1147 1148 if (ret < 0) { 1149 /* Do nothing, write notifier decided to fail this request */ 1150 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1151 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1152 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 1153 } else { 1154 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1155 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 1156 } 1157 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 1158 1159 if (ret == 0 && !bs->enable_write_cache) { 1160 ret = bdrv_co_flush(bs); 1161 } 1162 1163 bdrv_set_dirty(bs, sector_num, nb_sectors); 1164 1165 if (bs->wr_highest_offset < offset + bytes) { 1166 bs->wr_highest_offset = offset + bytes; 1167 } 1168 1169 if (ret >= 0) { 1170 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1171 } 1172 1173 return ret; 1174 } 1175 1176 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, 1177 int64_t offset, 1178 unsigned int bytes, 
1179 BdrvRequestFlags flags, 1180 BdrvTrackedRequest *req) 1181 { 1182 uint8_t *buf = NULL; 1183 QEMUIOVector local_qiov; 1184 struct iovec iov; 1185 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1186 unsigned int head_padding_bytes, tail_padding_bytes; 1187 int ret = 0; 1188 1189 head_padding_bytes = offset & (align - 1); 1190 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1191 1192 1193 assert(flags & BDRV_REQ_ZERO_WRITE); 1194 if (head_padding_bytes || tail_padding_bytes) { 1195 buf = qemu_blockalign(bs, align); 1196 iov = (struct iovec) { 1197 .iov_base = buf, 1198 .iov_len = align, 1199 }; 1200 qemu_iovec_init_external(&local_qiov, &iov, 1); 1201 } 1202 if (head_padding_bytes) { 1203 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1204 1205 /* RMW the unaligned part before head. */ 1206 mark_request_serialising(req, align); 1207 wait_serialising_requests(req); 1208 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1209 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1210 align, &local_qiov, 0); 1211 if (ret < 0) { 1212 goto fail; 1213 } 1214 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1215 1216 memset(buf + head_padding_bytes, 0, zero_bytes); 1217 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1218 &local_qiov, 1219 flags & ~BDRV_REQ_ZERO_WRITE); 1220 if (ret < 0) { 1221 goto fail; 1222 } 1223 offset += zero_bytes; 1224 bytes -= zero_bytes; 1225 } 1226 1227 assert(!bytes || (offset & (align - 1)) == 0); 1228 if (bytes >= align) { 1229 /* Write the aligned part in the middle. */ 1230 uint64_t aligned_bytes = bytes & ~(align - 1); 1231 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1232 NULL, flags); 1233 if (ret < 0) { 1234 goto fail; 1235 } 1236 bytes -= aligned_bytes; 1237 offset += aligned_bytes; 1238 } 1239 1240 assert(!bytes || (offset & (align - 1)) == 0); 1241 if (bytes) { 1242 assert(align == tail_padding_bytes + bytes); 1243 /* RMW the unaligned part after tail. 
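 * (read one aligned block back, zero its first 'bytes' bytes, keep the tail
 * that was just read, then write the whole block out again)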
*/ 1244 mark_request_serialising(req, align); 1245 wait_serialising_requests(req); 1246 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1247 ret = bdrv_aligned_preadv(bs, req, offset, align, 1248 align, &local_qiov, 0); 1249 if (ret < 0) { 1250 goto fail; 1251 } 1252 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1253 1254 memset(buf, 0, bytes); 1255 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1256 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1257 } 1258 fail: 1259 qemu_vfree(buf); 1260 return ret; 1261 1262 } 1263 1264 /* 1265 * Handle a write request in coroutine context 1266 */ 1267 int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1268 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1269 BdrvRequestFlags flags) 1270 { 1271 BdrvTrackedRequest req; 1272 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1273 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1274 uint8_t *head_buf = NULL; 1275 uint8_t *tail_buf = NULL; 1276 QEMUIOVector local_qiov; 1277 bool use_local_qiov = false; 1278 int ret; 1279 1280 if (!bs->drv) { 1281 return -ENOMEDIUM; 1282 } 1283 if (bs->read_only) { 1284 return -EPERM; 1285 } 1286 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1287 1288 ret = bdrv_check_byte_request(bs, offset, bytes); 1289 if (ret < 0) { 1290 return ret; 1291 } 1292 1293 /* throttling disk I/O */ 1294 if (bs->io_limits_enabled) { 1295 throttle_group_co_io_limits_intercept(bs, bytes, true); 1296 } 1297 1298 /* 1299 * Align write if necessary by performing a read-modify-write cycle. 1300 * Pad qiov with the read parts and be sure to have a tracked request not 1301 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1302 */ 1303 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1304 1305 if (!qiov) { 1306 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1307 goto out; 1308 } 1309 1310 if (offset & (align - 1)) { 1311 QEMUIOVector head_qiov; 1312 struct iovec head_iov; 1313 1314 mark_request_serialising(&req, align); 1315 wait_serialising_requests(&req); 1316 1317 head_buf = qemu_blockalign(bs, align); 1318 head_iov = (struct iovec) { 1319 .iov_base = head_buf, 1320 .iov_len = align, 1321 }; 1322 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1323 1324 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1325 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1326 align, &head_qiov, 0); 1327 if (ret < 0) { 1328 goto fail; 1329 } 1330 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1331 1332 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1333 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1334 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1335 use_local_qiov = true; 1336 1337 bytes += offset & (align - 1); 1338 offset = offset & ~(align - 1); 1339 } 1340 1341 if ((offset + bytes) & (align - 1)) { 1342 QEMUIOVector tail_qiov; 1343 struct iovec tail_iov; 1344 size_t tail_bytes; 1345 bool waited; 1346 1347 mark_request_serialising(&req, align); 1348 waited = wait_serialising_requests(&req); 1349 assert(!waited || !use_local_qiov); 1350 1351 tail_buf = qemu_blockalign(bs, align); 1352 tail_iov = (struct iovec) { 1353 .iov_base = tail_buf, 1354 .iov_len = align, 1355 }; 1356 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1357 1358 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1359 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1360 align, &tail_qiov, 0); 1361 if (ret < 0) { 1362 goto fail; 1363 } 1364 bdrv_debug_event(bs, 
BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1365 1366 if (!use_local_qiov) { 1367 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1368 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1369 use_local_qiov = true; 1370 } 1371 1372 tail_bytes = (offset + bytes) & (align - 1); 1373 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1374 1375 bytes = ROUND_UP(bytes, align); 1376 } 1377 1378 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1379 use_local_qiov ? &local_qiov : qiov, 1380 flags); 1381 1382 fail: 1383 1384 if (use_local_qiov) { 1385 qemu_iovec_destroy(&local_qiov); 1386 } 1387 qemu_vfree(head_buf); 1388 qemu_vfree(tail_buf); 1389 out: 1390 tracked_request_end(&req); 1391 return ret; 1392 } 1393 1394 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1395 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1396 BdrvRequestFlags flags) 1397 { 1398 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1399 return -EINVAL; 1400 } 1401 1402 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1403 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1404 } 1405 1406 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1407 int nb_sectors, QEMUIOVector *qiov) 1408 { 1409 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1410 1411 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1412 } 1413 1414 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1415 int64_t sector_num, int nb_sectors, 1416 BdrvRequestFlags flags) 1417 { 1418 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1419 1420 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1421 flags &= ~BDRV_REQ_MAY_UNMAP; 1422 } 1423 1424 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1425 BDRV_REQ_ZERO_WRITE | flags); 1426 } 1427 1428 typedef struct BdrvCoGetBlockStatusData { 1429 BlockDriverState *bs; 1430 BlockDriverState *base; 1431 BlockDriverState **file; 1432 int64_t sector_num; 1433 int nb_sectors; 1434 int *pnum; 1435 int64_t ret; 1436 bool done; 1437 } BdrvCoGetBlockStatusData; 1438 1439 /* 1440 * Returns the allocation status of the specified sectors. 1441 * Drivers not implementing the functionality are assumed to not support 1442 * backing files, hence all their sectors are reported as allocated. 1443 * 1444 * If 'sector_num' is beyond the end of the disk image the return value is 0 1445 * and 'pnum' is set to 0. 1446 * 1447 * 'pnum' is set to the number of sectors (including and immediately following 1448 * the specified sector) that are known to be in the same 1449 * allocated/unallocated state. 1450 * 1451 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1452 * beyond the end of the disk image it will be clamped. 1453 * 1454 * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' 1455 * points to the BDS which the sector range is allocated in. 
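 * The byte offset itself is OR-ed into the return value; the code below
 * recovers the sector number within 'file' as ret >> BDRV_SECTOR_BITS.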
1456 */ 1457 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1458 int64_t sector_num, 1459 int nb_sectors, int *pnum, 1460 BlockDriverState **file) 1461 { 1462 int64_t total_sectors; 1463 int64_t n; 1464 int64_t ret, ret2; 1465 1466 total_sectors = bdrv_nb_sectors(bs); 1467 if (total_sectors < 0) { 1468 return total_sectors; 1469 } 1470 1471 if (sector_num >= total_sectors) { 1472 *pnum = 0; 1473 return 0; 1474 } 1475 1476 n = total_sectors - sector_num; 1477 if (n < nb_sectors) { 1478 nb_sectors = n; 1479 } 1480 1481 if (!bs->drv->bdrv_co_get_block_status) { 1482 *pnum = nb_sectors; 1483 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1484 if (bs->drv->protocol_name) { 1485 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1486 } 1487 return ret; 1488 } 1489 1490 *file = NULL; 1491 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, 1492 file); 1493 if (ret < 0) { 1494 *pnum = 0; 1495 return ret; 1496 } 1497 1498 if (ret & BDRV_BLOCK_RAW) { 1499 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1500 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1501 *pnum, pnum, file); 1502 } 1503 1504 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1505 ret |= BDRV_BLOCK_ALLOCATED; 1506 } else { 1507 if (bdrv_unallocated_blocks_are_zero(bs)) { 1508 ret |= BDRV_BLOCK_ZERO; 1509 } else if (bs->backing) { 1510 BlockDriverState *bs2 = bs->backing->bs; 1511 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1512 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1513 ret |= BDRV_BLOCK_ZERO; 1514 } 1515 } 1516 } 1517 1518 if (*file && *file != bs && 1519 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1520 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1521 BlockDriverState *file2; 1522 int file_pnum; 1523 1524 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, 1525 *pnum, &file_pnum, &file2); 1526 if (ret2 >= 0) { 1527 /* Ignore errors. This is just providing extra information, it 1528 * is useful but not necessary. 1529 */ 1530 if (!file_pnum) { 1531 /* !file_pnum indicates an offset at or beyond the EOF; it is 1532 * perfectly valid for the format block driver to point to such 1533 * offsets, so catch it and mark everything as zero */ 1534 ret |= BDRV_BLOCK_ZERO; 1535 } else { 1536 /* Limit request to the range reported by the protocol driver */ 1537 *pnum = file_pnum; 1538 ret |= (ret2 & BDRV_BLOCK_ZERO); 1539 } 1540 } 1541 } 1542 1543 return ret; 1544 } 1545 1546 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1547 BlockDriverState *base, 1548 int64_t sector_num, 1549 int nb_sectors, 1550 int *pnum, 1551 BlockDriverState **file) 1552 { 1553 BlockDriverState *p; 1554 int64_t ret = 0; 1555 1556 assert(bs != base); 1557 for (p = bs; p != base; p = backing_bs(p)) { 1558 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); 1559 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1560 break; 1561 } 1562 /* [sector_num, pnum] unallocated on this layer, which could be only 1563 * the first part of [sector_num, nb_sectors]. 
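 * The next iteration therefore restricts its query of the backing layer to
 * that unallocated prefix.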
*/ 1564 nb_sectors = MIN(nb_sectors, *pnum); 1565 } 1566 return ret; 1567 } 1568 1569 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1570 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1571 { 1572 BdrvCoGetBlockStatusData *data = opaque; 1573 1574 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1575 data->sector_num, 1576 data->nb_sectors, 1577 data->pnum, 1578 data->file); 1579 data->done = true; 1580 } 1581 1582 /* 1583 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1584 * 1585 * See bdrv_co_get_block_status_above() for details. 1586 */ 1587 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1588 BlockDriverState *base, 1589 int64_t sector_num, 1590 int nb_sectors, int *pnum, 1591 BlockDriverState **file) 1592 { 1593 Coroutine *co; 1594 BdrvCoGetBlockStatusData data = { 1595 .bs = bs, 1596 .base = base, 1597 .file = file, 1598 .sector_num = sector_num, 1599 .nb_sectors = nb_sectors, 1600 .pnum = pnum, 1601 .done = false, 1602 }; 1603 1604 if (qemu_in_coroutine()) { 1605 /* Fast-path if already in coroutine context */ 1606 bdrv_get_block_status_above_co_entry(&data); 1607 } else { 1608 AioContext *aio_context = bdrv_get_aio_context(bs); 1609 1610 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1611 qemu_coroutine_enter(co, &data); 1612 while (!data.done) { 1613 aio_poll(aio_context, true); 1614 } 1615 } 1616 return data.ret; 1617 } 1618 1619 int64_t bdrv_get_block_status(BlockDriverState *bs, 1620 int64_t sector_num, 1621 int nb_sectors, int *pnum, 1622 BlockDriverState **file) 1623 { 1624 return bdrv_get_block_status_above(bs, backing_bs(bs), 1625 sector_num, nb_sectors, pnum, file); 1626 } 1627 1628 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1629 int nb_sectors, int *pnum) 1630 { 1631 BlockDriverState *file; 1632 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, 1633 &file); 1634 if (ret < 0) { 1635 return ret; 1636 } 1637 return !!(ret & BDRV_BLOCK_ALLOCATED); 1638 } 1639 1640 /* 1641 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1642 * 1643 * Return true if the given sector is allocated in any image between 1644 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1645 * sector is allocated in any image of the chain. Return false otherwise. 1646 * 1647 * 'pnum' is set to the number of sectors (including and immediately following 1648 * the specified sector) that are known to be in the same 1649 * allocated/unallocated state. 1650 * 1651 */ 1652 int bdrv_is_allocated_above(BlockDriverState *top, 1653 BlockDriverState *base, 1654 int64_t sector_num, 1655 int nb_sectors, int *pnum) 1656 { 1657 BlockDriverState *intermediate; 1658 int ret, n = nb_sectors; 1659 1660 intermediate = top; 1661 while (intermediate && intermediate != base) { 1662 int pnum_inter; 1663 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1664 &pnum_inter); 1665 if (ret < 0) { 1666 return ret; 1667 } else if (ret) { 1668 *pnum = pnum_inter; 1669 return 1; 1670 } 1671 1672 /* 1673 * [sector_num, nb_sectors] is unallocated on top but intermediate 1674 * might have 1675 * 1676 * [sector_num+x, nr_sectors] allocated. 
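 * so 'n' is normally trimmed below to the unallocated prefix reported by
 * each visited layer.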
1677 */ 1678 if (n > pnum_inter && 1679 (intermediate == top || 1680 sector_num + pnum_inter < intermediate->total_sectors)) { 1681 n = pnum_inter; 1682 } 1683 1684 intermediate = backing_bs(intermediate); 1685 } 1686 1687 *pnum = n; 1688 return 0; 1689 } 1690 1691 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1692 const uint8_t *buf, int nb_sectors) 1693 { 1694 BlockDriver *drv = bs->drv; 1695 int ret; 1696 1697 if (!drv) { 1698 return -ENOMEDIUM; 1699 } 1700 if (!drv->bdrv_write_compressed) { 1701 return -ENOTSUP; 1702 } 1703 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1704 if (ret < 0) { 1705 return ret; 1706 } 1707 1708 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1709 1710 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1711 } 1712 1713 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1714 int64_t pos, int size) 1715 { 1716 QEMUIOVector qiov; 1717 struct iovec iov = { 1718 .iov_base = (void *) buf, 1719 .iov_len = size, 1720 }; 1721 1722 qemu_iovec_init_external(&qiov, &iov, 1); 1723 return bdrv_writev_vmstate(bs, &qiov, pos); 1724 } 1725 1726 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1727 { 1728 BlockDriver *drv = bs->drv; 1729 1730 if (!drv) { 1731 return -ENOMEDIUM; 1732 } else if (drv->bdrv_save_vmstate) { 1733 return drv->bdrv_save_vmstate(bs, qiov, pos); 1734 } else if (bs->file) { 1735 return bdrv_writev_vmstate(bs->file->bs, qiov, pos); 1736 } 1737 1738 return -ENOTSUP; 1739 } 1740 1741 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1742 int64_t pos, int size) 1743 { 1744 BlockDriver *drv = bs->drv; 1745 if (!drv) 1746 return -ENOMEDIUM; 1747 if (drv->bdrv_load_vmstate) 1748 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1749 if (bs->file) 1750 return bdrv_load_vmstate(bs->file->bs, buf, pos, size); 1751 return -ENOTSUP; 1752 } 1753 1754 /**************************************************************/ 1755 /* async I/Os */ 1756 1757 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1758 QEMUIOVector *qiov, int nb_sectors, 1759 BlockCompletionFunc *cb, void *opaque) 1760 { 1761 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1762 1763 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1764 cb, opaque, false); 1765 } 1766 1767 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1768 QEMUIOVector *qiov, int nb_sectors, 1769 BlockCompletionFunc *cb, void *opaque) 1770 { 1771 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1772 1773 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1774 cb, opaque, true); 1775 } 1776 1777 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1778 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1779 BlockCompletionFunc *cb, void *opaque) 1780 { 1781 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1782 1783 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1784 BDRV_REQ_ZERO_WRITE | flags, 1785 cb, opaque, true); 1786 } 1787 1788 1789 typedef struct MultiwriteCB { 1790 int error; 1791 int num_requests; 1792 int num_callbacks; 1793 struct { 1794 BlockCompletionFunc *cb; 1795 void *opaque; 1796 QEMUIOVector *free_qiov; 1797 } callbacks[]; 1798 } MultiwriteCB; 1799 1800 static void multiwrite_user_cb(MultiwriteCB *mcb) 1801 { 1802 int i; 1803 1804 for (i = 0; i < mcb->num_callbacks; i++) { 1805 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1806 if (mcb->callbacks[i].free_qiov) { 1807 
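            /* free_qiov was allocated by multiwrite_merge() for merged
             * requests */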
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them;
    // only exactly sequential or overlapping requests are merged.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
            bs->bl.max_iov) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    if (bs->blk) {
        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
                              num_reqs - outidx - 1);
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted.
In error case this function returns -1, and any of the 1929 * requests may or may not be submitted yet. In particular, this means that the 1930 * callback will be called for some of the requests, for others it won't. The 1931 * caller must check the error field of the BlockRequest to wait for the right 1932 * callbacks (if error != 0, no callback will be called). 1933 * 1934 * The implementation may modify the contents of the reqs array, e.g. to merge 1935 * requests. However, the fields opaque and error are left unmodified as they 1936 * are used to signal failure for a single request to the caller. 1937 */ 1938 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 1939 { 1940 MultiwriteCB *mcb; 1941 int i; 1942 1943 /* don't submit writes if we don't have a medium */ 1944 if (bs->drv == NULL) { 1945 for (i = 0; i < num_reqs; i++) { 1946 reqs[i].error = -ENOMEDIUM; 1947 } 1948 return -1; 1949 } 1950 1951 if (num_reqs == 0) { 1952 return 0; 1953 } 1954 1955 // Create MultiwriteCB structure 1956 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 1957 mcb->num_requests = 0; 1958 mcb->num_callbacks = num_reqs; 1959 1960 for (i = 0; i < num_reqs; i++) { 1961 mcb->callbacks[i].cb = reqs[i].cb; 1962 mcb->callbacks[i].opaque = reqs[i].opaque; 1963 } 1964 1965 // Check for mergable requests 1966 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 1967 1968 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 1969 1970 /* Run the aio requests. */ 1971 mcb->num_requests = num_reqs; 1972 for (i = 0; i < num_reqs; i++) { 1973 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 1974 reqs[i].nb_sectors, reqs[i].flags, 1975 multiwrite_cb, mcb, 1976 true); 1977 } 1978 1979 return 0; 1980 } 1981 1982 void bdrv_aio_cancel(BlockAIOCB *acb) 1983 { 1984 qemu_aio_ref(acb); 1985 bdrv_aio_cancel_async(acb); 1986 while (acb->refcnt > 1) { 1987 if (acb->aiocb_info->get_aio_context) { 1988 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 1989 } else if (acb->bs) { 1990 aio_poll(bdrv_get_aio_context(acb->bs), true); 1991 } else { 1992 abort(); 1993 } 1994 } 1995 qemu_aio_unref(acb); 1996 } 1997 1998 /* Async version of aio cancel. The caller is not blocked if the acb implements 1999 * cancel_async, otherwise we do nothing and let the request normally complete. 2000 * In either case the completion callback must be called. 
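 * Callers that need to block until the request has actually completed should
 * use bdrv_aio_cancel() above, which polls until the AIOCB reference count
 * drops.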
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
 */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}

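/*
 * Illustrative sketch (not compiled) of the guarantee that bdrv_co_complete()
 * and bdrv_co_maybe_schedule_bh() provide together: the completion callback
 * never runs before the submitting function has returned.  If the coroutine
 * finishes synchronously, the callback is deferred to a bottom half in the
 * BDS's AioContext.  The state struct and callback below are hypothetical.
 *
 *   struct MyFlush {
 *       BlockAIOCB *acb;
 *       bool done;
 *   };
 *
 *   static void my_flush_cb(void *opaque, int ret)
 *   {
 *       struct MyFlush *f = opaque;
 *       f->done = true;          // never runs before bdrv_aio_flush() returns
 *   }
 *
 *   f->acb = bdrv_aio_flush(bs, my_flush_cb, f);
 *   // Safe: even if the flush already completed inside bdrv_aio_flush(),
 *   // my_flush_cb() has not run yet, so storing f->acb cannot race with it.
 */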
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_malloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_free(acb);
    }
}

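/*
 * Illustrative sketch (not compiled) of how a block driver typically uses the
 * AIOCB helpers above.  The struct, AIOCBInfo and completion call below are
 * hypothetical.
 *
 *   typedef struct MyAIOCB {
 *       BlockAIOCB common;       // must be the first member
 *       int my_state;
 *   } MyAIOCB;
 *
 *   static const AIOCBInfo my_aiocb_info = {
 *       .aiocb_size = sizeof(MyAIOCB),
 *   };
 *
 *   MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 *   // ... start the request and keep acb until it completes ...
 *   acb->common.cb(acb->common.opaque, ret);   // report completion
 *   qemu_aio_unref(acb);                       // drop the initial reference
 *
 * bdrv_aio_cancel() takes an extra reference with qemu_aio_ref() while it
 * polls, which is why the refcount can temporarily be greater than one.
 */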
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

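/*
 * Illustrative sketch (not compiled) of the CoroutineIOCompletion handshake
 * used by bdrv_co_io_em() above and by the flush/discard/ioctl paths below:
 * issue an AIO request with bdrv_co_io_em_complete() as its callback, yield,
 * and pick up the result once the callback re-enters the coroutine.
 * my_aio_request() is a hypothetical AIO-style function.
 *
 *   CoroutineIOCompletion co = {
 *       .coroutine = qemu_coroutine_self(),
 *   };
 *   BlockAIOCB *acb;
 *
 *   acb = my_aio_request(bs, ..., bdrv_co_io_em_complete, &co);
 *   if (!acb) {
 *       return -EIO;
 *   }
 *   qemu_coroutine_yield();      // bdrv_co_io_em_complete() re-enters us here
 *   return co.ret;               // result stored by the callback
 */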
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;
    BdrvTrackedRequest req;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    tracked_request_end(&req);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    BdrvTrackedRequest req;
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors,
                          BDRV_TRACKED_DISCARD);
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass the clamped chunk size (num), not the full remaining
             * request, so this path obeys the same alignment and
             * max_discard limits as the bdrv_co_discard path above. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    ret = 0;
out:
    tracked_request_end(&req);
    return ret;
}

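/*
 * Illustrative walk-through (not compiled) of the splitting loop in
 * bdrv_co_discard() above, with made-up limits: discard_alignment = 8,
 * max_discard = 1024, and a request of sector_num = 10, nb_sectors = 2000.
 *
 *   1st iteration: sector 10 is unaligned (10 % 8 == 2), so
 *                  num = 8 - 2 = 6            -> discards sectors 10..15
 *   2nd iteration: sector 16 is aligned, num = MIN(1994, 1024)
 *                                             -> discards sectors 16..1039
 *   3rd iteration: num = MIN(970, 1024) = 970 -> discards sectors 1040..2009
 *
 * Each chunk ends or starts on a discard_alignment boundary where possible
 * and never exceeds max_discard sectors.
 */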
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct {
    CoroutineIOCompletion *co;
    QEMUBH *bh;
} BdrvIoctlCompletionData;

static void bdrv_ioctl_bh_cb(void *opaque)
{
    BdrvIoctlCompletionData *data = opaque;

    bdrv_co_io_em_complete(data->co, -ENOTSUP);
    qemu_bh_delete(data->bh);
    g_free(data);    /* allocated in bdrv_co_do_ioctl(), freed here */
}

static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest tracked_req;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
    if (!drv || !drv->bdrv_aio_ioctl) {
        co.ret = -ENOTSUP;
        goto out;
    }

    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
    if (!acb) {
        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
                              bdrv_ioctl_bh_cb, data);
        data->co = &co;
        qemu_bh_schedule(data->bh);
    }
    qemu_coroutine_yield();
out:
    tracked_request_end(&tracked_req);
    return co.ret;
}

typedef struct {
    BlockDriverState *bs;
    int req;
    void *buf;
    int ret;
} BdrvIoctlCoData;

static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
{
    BdrvIoctlCoData *data = opaque;
    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
}

/* needed for generic scsi interface */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BdrvIoctlCoData data = {
        .bs = bs,
        .req = req,
        .buf = buf,
        .ret = -EINPROGRESS,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_co_ioctl_entry(&data);
    } else {
        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);

        qemu_coroutine_enter(co, &data);
        while (data.ret == -EINPROGRESS) {
            aio_poll(bdrv_get_aio_context(bs), true);
        }
    }
    return data.ret;
}

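/*
 * Illustrative sketch (not compiled) of the synchronous wrapper pattern shared
 * by bdrv_flush(), bdrv_discard() and bdrv_ioctl() above: run the coroutine
 * entry point directly when already in coroutine context, otherwise spawn a
 * coroutine and poll the AioContext until it reports completion.  MyCoData
 * and my_co_entry() are hypothetical.
 *
 *   MyCoData data = { .bs = bs, .ret = NOT_DONE };
 *
 *   if (qemu_in_coroutine()) {
 *       my_co_entry(&data);                 // fast path, no extra coroutine
 *   } else {
 *       Coroutine *co = qemu_coroutine_create(my_co_entry);
 *
 *       qemu_coroutine_enter(co, &data);
 *       while (data.ret == NOT_DONE) {
 *           aio_poll(bdrv_get_aio_context(bs), true);
 *       }
 *   }
 *   return data.ret;
 */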
static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
                                      acb->req.req, acb->req.buf);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
                                            bs, cb, opaque);
    Coroutine *co;

    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.req = req;
    acb->req.buf = buf;
    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file->bs);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file->bs);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file->bs);
    }
    bdrv_start_throttled_reqs(bs);
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    if (!bs->quiesce_counter++) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }
    bdrv_drain(bs);
}

void bdrv_drained_end(BlockDriverState *bs)
{
    assert(bs->quiesce_counter > 0);
    if (--bs->quiesce_counter > 0) {
        return;
    }
    aio_enable_external(bdrv_get_aio_context(bs));
}
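
/*
 * Illustrative sketch (not compiled) of how callers typically use the
 * drained-section helpers above: quiesce external request sources, perform
 * work that must not race with new I/O, then resume.
 *
 *   bdrv_drained_begin(bs);
 *   // ... e.g. reconfigure the BDS or take a consistent snapshot ...
 *   bdrv_drained_end(bs);
 *
 * Sections nest: external requests are re-enabled only when the outermost
 * bdrv_drained_end() runs.
 */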