1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "trace.h" 26 #include "sysemu/block-backend.h" 27 #include "block/blockjob.h" 28 #include "block/block_int.h" 29 #include "block/throttle-groups.h" 30 #include "qemu/error-report.h" 31 32 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 33 34 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 35 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 36 BlockCompletionFunc *cb, void *opaque); 37 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 38 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 39 BlockCompletionFunc *cb, void *opaque); 40 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 41 int64_t sector_num, int nb_sectors, 42 QEMUIOVector *iov); 43 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 44 int64_t sector_num, int nb_sectors, 45 QEMUIOVector *iov); 46 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 47 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 48 BdrvRequestFlags flags); 49 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 50 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 51 BdrvRequestFlags flags); 52 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 53 int64_t sector_num, 54 QEMUIOVector *qiov, 55 int nb_sectors, 56 BdrvRequestFlags flags, 57 BlockCompletionFunc *cb, 58 void *opaque, 59 bool is_write); 60 static void coroutine_fn bdrv_co_do_rw(void *opaque); 61 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 62 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); 63 64 /* throttling disk I/O limits */ 65 void bdrv_set_io_limits(BlockDriverState *bs, 66 ThrottleConfig *cfg) 67 { 68 int i; 69 70 throttle_group_config(bs, cfg); 71 72 for (i = 0; i < 2; i++) { 73 qemu_co_enter_next(&bs->throttled_reqs[i]); 74 } 75 } 76 77 /* this function drain all the throttled IOs */ 78 static bool bdrv_start_throttled_reqs(BlockDriverState *bs) 79 { 80 bool drained = false; 81 bool enabled = bs->io_limits_enabled; 82 int i; 83 84 bs->io_limits_enabled = false; 85 86 for (i = 0; i < 2; i++) { 87 while (qemu_co_enter_next(&bs->throttled_reqs[i])) { 88 drained = true; 89 } 90 } 91 92 bs->io_limits_enabled = enabled; 93 94 return drained; 95 } 96 97 void bdrv_io_limits_disable(BlockDriverState *bs) 98 { 99 bs->io_limits_enabled = false; 
100 bdrv_start_throttled_reqs(bs); 101 throttle_group_unregister_bs(bs); 102 } 103 104 /* should be called before bdrv_set_io_limits if a limit is set */ 105 void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) 106 { 107 assert(!bs->io_limits_enabled); 108 throttle_group_register_bs(bs, group); 109 bs->io_limits_enabled = true; 110 } 111 112 void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) 113 { 114 /* this bs is not part of any group */ 115 if (!bs->throttle_state) { 116 return; 117 } 118 119 /* this bs is a part of the same group than the one we want */ 120 if (!g_strcmp0(throttle_group_get_name(bs), group)) { 121 return; 122 } 123 124 /* need to change the group this bs belong to */ 125 bdrv_io_limits_disable(bs); 126 bdrv_io_limits_enable(bs, group); 127 } 128 129 void bdrv_setup_io_funcs(BlockDriver *bdrv) 130 { 131 /* Block drivers without coroutine functions need emulation */ 132 if (!bdrv->bdrv_co_readv) { 133 bdrv->bdrv_co_readv = bdrv_co_readv_em; 134 bdrv->bdrv_co_writev = bdrv_co_writev_em; 135 136 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if 137 * the block driver lacks aio we need to emulate that too. 138 */ 139 if (!bdrv->bdrv_aio_readv) { 140 /* add AIO emulation layer */ 141 bdrv->bdrv_aio_readv = bdrv_aio_readv_em; 142 bdrv->bdrv_aio_writev = bdrv_aio_writev_em; 143 } 144 } 145 } 146 147 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) 148 { 149 BlockDriver *drv = bs->drv; 150 Error *local_err = NULL; 151 152 memset(&bs->bl, 0, sizeof(bs->bl)); 153 154 if (!drv) { 155 return; 156 } 157 158 /* Take some limits from the children as a default */ 159 if (bs->file) { 160 bdrv_refresh_limits(bs->file->bs, &local_err); 161 if (local_err) { 162 error_propagate(errp, local_err); 163 return; 164 } 165 bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length; 166 bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length; 167 bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment; 168 bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment; 169 } else { 170 bs->bl.min_mem_alignment = 512; 171 bs->bl.opt_mem_alignment = getpagesize(); 172 } 173 174 if (bs->backing) { 175 bdrv_refresh_limits(bs->backing->bs, &local_err); 176 if (local_err) { 177 error_propagate(errp, local_err); 178 return; 179 } 180 bs->bl.opt_transfer_length = 181 MAX(bs->bl.opt_transfer_length, 182 bs->backing->bs->bl.opt_transfer_length); 183 bs->bl.max_transfer_length = 184 MIN_NON_ZERO(bs->bl.max_transfer_length, 185 bs->backing->bs->bl.max_transfer_length); 186 bs->bl.opt_mem_alignment = 187 MAX(bs->bl.opt_mem_alignment, 188 bs->backing->bs->bl.opt_mem_alignment); 189 bs->bl.min_mem_alignment = 190 MAX(bs->bl.min_mem_alignment, 191 bs->backing->bs->bl.min_mem_alignment); 192 } 193 194 /* Then let the driver override it */ 195 if (drv->bdrv_refresh_limits) { 196 drv->bdrv_refresh_limits(bs, errp); 197 } 198 } 199 200 /** 201 * The copy-on-read flag is actually a reference count so multiple users may 202 * use the feature without worrying about clobbering its previous state. 203 * Copy-on-read stays enabled until all users have called to disable it. 
204 */ 205 void bdrv_enable_copy_on_read(BlockDriverState *bs) 206 { 207 bs->copy_on_read++; 208 } 209 210 void bdrv_disable_copy_on_read(BlockDriverState *bs) 211 { 212 assert(bs->copy_on_read > 0); 213 bs->copy_on_read--; 214 } 215 216 /* Check if any requests are in-flight (including throttled requests) */ 217 bool bdrv_requests_pending(BlockDriverState *bs) 218 { 219 BdrvChild *child; 220 221 if (!QLIST_EMPTY(&bs->tracked_requests)) { 222 return true; 223 } 224 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 225 return true; 226 } 227 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 228 return true; 229 } 230 231 QLIST_FOREACH(child, &bs->children, next) { 232 if (bdrv_requests_pending(child->bs)) { 233 return true; 234 } 235 } 236 237 return false; 238 } 239 240 static void bdrv_drain_recurse(BlockDriverState *bs) 241 { 242 BdrvChild *child; 243 244 if (bs->drv && bs->drv->bdrv_drain) { 245 bs->drv->bdrv_drain(bs); 246 } 247 QLIST_FOREACH(child, &bs->children, next) { 248 bdrv_drain_recurse(child->bs); 249 } 250 } 251 252 /* 253 * Wait for pending requests to complete on a single BlockDriverState subtree, 254 * and suspend block driver's internal I/O until next request arrives. 255 * 256 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 257 * AioContext. 258 * 259 * Only this BlockDriverState's AioContext is run, so in-flight requests must 260 * not depend on events in other AioContexts. In that case, use 261 * bdrv_drain_all() instead. 262 */ 263 void bdrv_drain(BlockDriverState *bs) 264 { 265 bool busy = true; 266 267 bdrv_drain_recurse(bs); 268 while (busy) { 269 /* Keep iterating */ 270 bdrv_flush_io_queue(bs); 271 busy = bdrv_requests_pending(bs); 272 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 273 } 274 } 275 276 /* 277 * Wait for pending requests to complete across all BlockDriverStates 278 * 279 * This function does not flush data to disk, use bdrv_flush_all() for that 280 * after calling this function. 281 */ 282 void bdrv_drain_all(void) 283 { 284 /* Always run first iteration so any pending completion BHs run */ 285 bool busy = true; 286 BlockDriverState *bs = NULL; 287 GSList *aio_ctxs = NULL, *ctx; 288 289 while ((bs = bdrv_next(bs))) { 290 AioContext *aio_context = bdrv_get_aio_context(bs); 291 292 aio_context_acquire(aio_context); 293 if (bs->job) { 294 block_job_pause(bs->job); 295 } 296 aio_context_release(aio_context); 297 298 if (!g_slist_find(aio_ctxs, aio_context)) { 299 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 300 } 301 } 302 303 /* Note that completion of an asynchronous I/O operation can trigger any 304 * number of other I/O operations on other devices---for example a 305 * coroutine can submit an I/O request to another device in response to 306 * request completion. Therefore we must keep looping until there was no 307 * more activity rather than simply draining each device independently. 
308 */ 309 while (busy) { 310 busy = false; 311 312 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 313 AioContext *aio_context = ctx->data; 314 bs = NULL; 315 316 aio_context_acquire(aio_context); 317 while ((bs = bdrv_next(bs))) { 318 if (aio_context == bdrv_get_aio_context(bs)) { 319 bdrv_flush_io_queue(bs); 320 if (bdrv_requests_pending(bs)) { 321 busy = true; 322 aio_poll(aio_context, busy); 323 } 324 } 325 } 326 busy |= aio_poll(aio_context, false); 327 aio_context_release(aio_context); 328 } 329 } 330 331 bs = NULL; 332 while ((bs = bdrv_next(bs))) { 333 AioContext *aio_context = bdrv_get_aio_context(bs); 334 335 aio_context_acquire(aio_context); 336 if (bs->job) { 337 block_job_resume(bs->job); 338 } 339 aio_context_release(aio_context); 340 } 341 g_slist_free(aio_ctxs); 342 } 343 344 /** 345 * Remove an active request from the tracked requests list 346 * 347 * This function should be called when a tracked request is completing. 348 */ 349 static void tracked_request_end(BdrvTrackedRequest *req) 350 { 351 if (req->serialising) { 352 req->bs->serialising_in_flight--; 353 } 354 355 QLIST_REMOVE(req, list); 356 qemu_co_queue_restart_all(&req->wait_queue); 357 } 358 359 /** 360 * Add an active request to the tracked requests list 361 */ 362 static void tracked_request_begin(BdrvTrackedRequest *req, 363 BlockDriverState *bs, 364 int64_t offset, 365 unsigned int bytes, 366 enum BdrvTrackedRequestType type) 367 { 368 *req = (BdrvTrackedRequest){ 369 .bs = bs, 370 .offset = offset, 371 .bytes = bytes, 372 .type = type, 373 .co = qemu_coroutine_self(), 374 .serialising = false, 375 .overlap_offset = offset, 376 .overlap_bytes = bytes, 377 }; 378 379 qemu_co_queue_init(&req->wait_queue); 380 381 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 382 } 383 384 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 385 { 386 int64_t overlap_offset = req->offset & ~(align - 1); 387 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 388 - overlap_offset; 389 390 if (!req->serialising) { 391 req->bs->serialising_in_flight++; 392 req->serialising = true; 393 } 394 395 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 396 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 397 } 398 399 /** 400 * Round a region to cluster boundaries 401 */ 402 void bdrv_round_to_clusters(BlockDriverState *bs, 403 int64_t sector_num, int nb_sectors, 404 int64_t *cluster_sector_num, 405 int *cluster_nb_sectors) 406 { 407 BlockDriverInfo bdi; 408 409 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 410 *cluster_sector_num = sector_num; 411 *cluster_nb_sectors = nb_sectors; 412 } else { 413 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 414 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 415 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 416 nb_sectors, c); 417 } 418 } 419 420 static int bdrv_get_cluster_size(BlockDriverState *bs) 421 { 422 BlockDriverInfo bdi; 423 int ret; 424 425 ret = bdrv_get_info(bs, &bdi); 426 if (ret < 0 || bdi.cluster_size == 0) { 427 return bs->request_alignment; 428 } else { 429 return bdi.cluster_size; 430 } 431 } 432 433 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 434 int64_t offset, unsigned int bytes) 435 { 436 /* aaaa bbbb */ 437 if (offset >= req->overlap_offset + req->overlap_bytes) { 438 return false; 439 } 440 /* bbbb aaaa */ 441 if (req->overlap_offset >= offset + bytes) { 442 return false; 443 } 444 return true; 445 } 446 447 static bool coroutine_fn 
wait_serialising_requests(BdrvTrackedRequest *self) 448 { 449 BlockDriverState *bs = self->bs; 450 BdrvTrackedRequest *req; 451 bool retry; 452 bool waited = false; 453 454 if (!bs->serialising_in_flight) { 455 return false; 456 } 457 458 do { 459 retry = false; 460 QLIST_FOREACH(req, &bs->tracked_requests, list) { 461 if (req == self || (!req->serialising && !self->serialising)) { 462 continue; 463 } 464 if (tracked_request_overlaps(req, self->overlap_offset, 465 self->overlap_bytes)) 466 { 467 /* Hitting this means there was a reentrant request, for 468 * example, a block driver issuing nested requests. This must 469 * never happen since it means deadlock. 470 */ 471 assert(qemu_coroutine_self() != req->co); 472 473 /* If the request is already (indirectly) waiting for us, or 474 * will wait for us as soon as it wakes up, then just go on 475 * (instead of producing a deadlock in the former case). */ 476 if (!req->waiting_for) { 477 self->waiting_for = req; 478 qemu_co_queue_wait(&req->wait_queue); 479 self->waiting_for = NULL; 480 retry = true; 481 waited = true; 482 break; 483 } 484 } 485 } 486 } while (retry); 487 488 return waited; 489 } 490 491 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 492 size_t size) 493 { 494 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 495 return -EIO; 496 } 497 498 if (!bdrv_is_inserted(bs)) { 499 return -ENOMEDIUM; 500 } 501 502 if (offset < 0) { 503 return -EIO; 504 } 505 506 return 0; 507 } 508 509 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 510 int nb_sectors) 511 { 512 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 513 return -EIO; 514 } 515 516 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 517 nb_sectors * BDRV_SECTOR_SIZE); 518 } 519 520 typedef struct RwCo { 521 BlockDriverState *bs; 522 int64_t offset; 523 QEMUIOVector *qiov; 524 bool is_write; 525 int ret; 526 BdrvRequestFlags flags; 527 } RwCo; 528 529 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 530 { 531 RwCo *rwco = opaque; 532 533 if (!rwco->is_write) { 534 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 535 rwco->qiov->size, rwco->qiov, 536 rwco->flags); 537 } else { 538 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 539 rwco->qiov->size, rwco->qiov, 540 rwco->flags); 541 } 542 } 543 544 /* 545 * Process a vectored synchronous request using coroutines 546 */ 547 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 548 QEMUIOVector *qiov, bool is_write, 549 BdrvRequestFlags flags) 550 { 551 Coroutine *co; 552 RwCo rwco = { 553 .bs = bs, 554 .offset = offset, 555 .qiov = qiov, 556 .is_write = is_write, 557 .ret = NOT_DONE, 558 .flags = flags, 559 }; 560 561 /** 562 * In sync call context, when the vcpu is blocked, this throttling timer 563 * will not fire; so the I/O throttling function has to be disabled here 564 * if it has been enabled. 
565 */ 566 if (bs->io_limits_enabled) { 567 fprintf(stderr, "Disabling I/O throttling on '%s' due " 568 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 569 bdrv_io_limits_disable(bs); 570 } 571 572 if (qemu_in_coroutine()) { 573 /* Fast-path if already in coroutine context */ 574 bdrv_rw_co_entry(&rwco); 575 } else { 576 AioContext *aio_context = bdrv_get_aio_context(bs); 577 578 co = qemu_coroutine_create(bdrv_rw_co_entry); 579 qemu_coroutine_enter(co, &rwco); 580 while (rwco.ret == NOT_DONE) { 581 aio_poll(aio_context, true); 582 } 583 } 584 return rwco.ret; 585 } 586 587 /* 588 * Process a synchronous request using coroutines 589 */ 590 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 591 int nb_sectors, bool is_write, BdrvRequestFlags flags) 592 { 593 QEMUIOVector qiov; 594 struct iovec iov = { 595 .iov_base = (void *)buf, 596 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 597 }; 598 599 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 600 return -EINVAL; 601 } 602 603 qemu_iovec_init_external(&qiov, &iov, 1); 604 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 605 &qiov, is_write, flags); 606 } 607 608 /* return < 0 if error. See bdrv_write() for the return codes */ 609 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 610 uint8_t *buf, int nb_sectors) 611 { 612 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 613 } 614 615 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 616 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 617 uint8_t *buf, int nb_sectors) 618 { 619 bool enabled; 620 int ret; 621 622 enabled = bs->io_limits_enabled; 623 bs->io_limits_enabled = false; 624 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 625 bs->io_limits_enabled = enabled; 626 return ret; 627 } 628 629 /* Return < 0 if error. Important errors are: 630 -EIO generic I/O error (may happen for all errors) 631 -ENOMEDIUM No media inserted. 632 -EINVAL Invalid sector number or nb_sectors 633 -EACCES Trying to write a read-only device 634 */ 635 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 636 const uint8_t *buf, int nb_sectors) 637 { 638 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 639 } 640 641 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 642 int nb_sectors, BdrvRequestFlags flags) 643 { 644 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 645 BDRV_REQ_ZERO_WRITE | flags); 646 } 647 648 /* 649 * Completely zero out a block device with the help of bdrv_write_zeroes. 650 * The operation is sped up by checking the block status and only writing 651 * zeroes to the device if they currently do not return zeroes. Optional 652 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 653 * 654 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
655 */ 656 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 657 { 658 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 659 int n; 660 661 target_sectors = bdrv_nb_sectors(bs); 662 if (target_sectors < 0) { 663 return target_sectors; 664 } 665 666 for (;;) { 667 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 668 if (nb_sectors <= 0) { 669 return 0; 670 } 671 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 672 if (ret < 0) { 673 error_report("error getting block status at sector %" PRId64 ": %s", 674 sector_num, strerror(-ret)); 675 return ret; 676 } 677 if (ret & BDRV_BLOCK_ZERO) { 678 sector_num += n; 679 continue; 680 } 681 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 682 if (ret < 0) { 683 error_report("error writing zeroes at sector %" PRId64 ": %s", 684 sector_num, strerror(-ret)); 685 return ret; 686 } 687 sector_num += n; 688 } 689 } 690 691 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 692 { 693 QEMUIOVector qiov; 694 struct iovec iov = { 695 .iov_base = (void *)buf, 696 .iov_len = bytes, 697 }; 698 int ret; 699 700 if (bytes < 0) { 701 return -EINVAL; 702 } 703 704 qemu_iovec_init_external(&qiov, &iov, 1); 705 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 706 if (ret < 0) { 707 return ret; 708 } 709 710 return bytes; 711 } 712 713 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 714 { 715 int ret; 716 717 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 718 if (ret < 0) { 719 return ret; 720 } 721 722 return qiov->size; 723 } 724 725 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 726 const void *buf, int bytes) 727 { 728 QEMUIOVector qiov; 729 struct iovec iov = { 730 .iov_base = (void *) buf, 731 .iov_len = bytes, 732 }; 733 734 if (bytes < 0) { 735 return -EINVAL; 736 } 737 738 qemu_iovec_init_external(&qiov, &iov, 1); 739 return bdrv_pwritev(bs, offset, &qiov); 740 } 741 742 /* 743 * Writes to the file and ensures that no writes are reordered across this 744 * request (acts as a barrier) 745 * 746 * Returns 0 on success, -errno in error cases. 747 */ 748 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 749 const void *buf, int count) 750 { 751 int ret; 752 753 ret = bdrv_pwrite(bs, offset, buf, count); 754 if (ret < 0) { 755 return ret; 756 } 757 758 /* No flush needed for cache modes that already do it */ 759 if (bs->enable_write_cache) { 760 bdrv_flush(bs); 761 } 762 763 return 0; 764 } 765 766 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 767 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 768 { 769 /* Perform I/O through a temporary buffer so that users who scribble over 770 * their read buffer while the operation is in progress do not end up 771 * modifying the image file. This is critical for zero-copy guest I/O 772 * where anything might happen inside guest memory. 773 */ 774 void *bounce_buffer; 775 776 BlockDriver *drv = bs->drv; 777 struct iovec iov; 778 QEMUIOVector bounce_qiov; 779 int64_t cluster_sector_num; 780 int cluster_nb_sectors; 781 size_t skip_bytes; 782 int ret; 783 784 /* Cover entire cluster so no additional backing file I/O is required when 785 * allocating cluster in the image file. 
786 */ 787 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 788 &cluster_sector_num, &cluster_nb_sectors); 789 790 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 791 cluster_sector_num, cluster_nb_sectors); 792 793 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 794 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 795 if (bounce_buffer == NULL) { 796 ret = -ENOMEM; 797 goto err; 798 } 799 800 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 801 802 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 803 &bounce_qiov); 804 if (ret < 0) { 805 goto err; 806 } 807 808 if (drv->bdrv_co_write_zeroes && 809 buffer_is_zero(bounce_buffer, iov.iov_len)) { 810 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 811 cluster_nb_sectors, 0); 812 } else { 813 /* This does not change the data on the disk, it is not necessary 814 * to flush even in cache=writethrough mode. 815 */ 816 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 817 &bounce_qiov); 818 } 819 820 if (ret < 0) { 821 /* It might be okay to ignore write errors for guest requests. If this 822 * is a deliberate copy-on-read then we don't want to ignore the error. 823 * Simply report it in all cases. 824 */ 825 goto err; 826 } 827 828 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 829 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 830 nb_sectors * BDRV_SECTOR_SIZE); 831 832 err: 833 qemu_vfree(bounce_buffer); 834 return ret; 835 } 836 837 /* 838 * Forwards an already correctly aligned request to the BlockDriver. This 839 * handles copy on read and zeroing after EOF; any other features must be 840 * implemented by the caller. 841 */ 842 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 843 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 844 int64_t align, QEMUIOVector *qiov, int flags) 845 { 846 BlockDriver *drv = bs->drv; 847 int ret; 848 849 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 850 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 851 852 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 853 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 854 assert(!qiov || bytes == qiov->size); 855 856 /* Handle Copy on Read and associated serialisation */ 857 if (flags & BDRV_REQ_COPY_ON_READ) { 858 /* If we touch the same cluster it counts as an overlap. This 859 * guarantees that allocating writes will be serialized and not race 860 * with each other for the same cluster. For example, in copy-on-read 861 * it ensures that the CoR read and write operations are atomic and 862 * guest writes cannot interleave between them. 
*/ 863 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 864 } 865 866 if (!(flags & BDRV_REQ_NO_SERIALISING)) { 867 wait_serialising_requests(req); 868 } 869 870 if (flags & BDRV_REQ_COPY_ON_READ) { 871 int pnum; 872 873 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 874 if (ret < 0) { 875 goto out; 876 } 877 878 if (!ret || pnum != nb_sectors) { 879 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 880 goto out; 881 } 882 } 883 884 /* Forward the request to the BlockDriver */ 885 if (!bs->zero_beyond_eof) { 886 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 887 } else { 888 /* Read zeros after EOF */ 889 int64_t total_sectors, max_nb_sectors; 890 891 total_sectors = bdrv_nb_sectors(bs); 892 if (total_sectors < 0) { 893 ret = total_sectors; 894 goto out; 895 } 896 897 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 898 align >> BDRV_SECTOR_BITS); 899 if (nb_sectors < max_nb_sectors) { 900 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 901 } else if (max_nb_sectors > 0) { 902 QEMUIOVector local_qiov; 903 904 qemu_iovec_init(&local_qiov, qiov->niov); 905 qemu_iovec_concat(&local_qiov, qiov, 0, 906 max_nb_sectors * BDRV_SECTOR_SIZE); 907 908 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 909 &local_qiov); 910 911 qemu_iovec_destroy(&local_qiov); 912 } else { 913 ret = 0; 914 } 915 916 /* Reading beyond end of file is supposed to produce zeroes */ 917 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 918 uint64_t offset = MAX(0, total_sectors - sector_num); 919 uint64_t bytes = (sector_num + nb_sectors - offset) * 920 BDRV_SECTOR_SIZE; 921 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 922 } 923 } 924 925 out: 926 return ret; 927 } 928 929 /* 930 * Handle a read request in coroutine context 931 */ 932 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 933 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 934 BdrvRequestFlags flags) 935 { 936 BlockDriver *drv = bs->drv; 937 BdrvTrackedRequest req; 938 939 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 940 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 941 uint8_t *head_buf = NULL; 942 uint8_t *tail_buf = NULL; 943 QEMUIOVector local_qiov; 944 bool use_local_qiov = false; 945 int ret; 946 947 if (!drv) { 948 return -ENOMEDIUM; 949 } 950 951 ret = bdrv_check_byte_request(bs, offset, bytes); 952 if (ret < 0) { 953 return ret; 954 } 955 956 /* Don't do copy-on-read if we read data before write operation */ 957 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { 958 flags |= BDRV_REQ_COPY_ON_READ; 959 } 960 961 /* throttling disk I/O */ 962 if (bs->io_limits_enabled) { 963 throttle_group_co_io_limits_intercept(bs, bytes, false); 964 } 965 966 /* Align read if necessary by padding qiov */ 967 if (offset & (align - 1)) { 968 head_buf = qemu_blockalign(bs, align); 969 qemu_iovec_init(&local_qiov, qiov->niov + 2); 970 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 971 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 972 use_local_qiov = true; 973 974 bytes += offset & (align - 1); 975 offset = offset & ~(align - 1); 976 } 977 978 if ((offset + bytes) & (align - 1)) { 979 if (!use_local_qiov) { 980 qemu_iovec_init(&local_qiov, qiov->niov + 1); 981 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 982 use_local_qiov = true; 983 } 984 tail_buf = qemu_blockalign(bs, align); 985 qemu_iovec_add(&local_qiov, tail_buf, 986 align - ((offset + bytes) & (align - 
1))); 987 988 bytes = ROUND_UP(bytes, align); 989 } 990 991 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 992 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 993 use_local_qiov ? &local_qiov : qiov, 994 flags); 995 tracked_request_end(&req); 996 997 if (use_local_qiov) { 998 qemu_iovec_destroy(&local_qiov); 999 qemu_vfree(head_buf); 1000 qemu_vfree(tail_buf); 1001 } 1002 1003 return ret; 1004 } 1005 1006 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 1007 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1008 BdrvRequestFlags flags) 1009 { 1010 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1011 return -EINVAL; 1012 } 1013 1014 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 1015 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1016 } 1017 1018 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 1019 int nb_sectors, QEMUIOVector *qiov) 1020 { 1021 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1022 1023 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1024 } 1025 1026 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, 1027 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1028 { 1029 trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); 1030 1031 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1032 BDRV_REQ_NO_SERIALISING); 1033 } 1034 1035 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1036 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1037 { 1038 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1039 1040 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1041 BDRV_REQ_COPY_ON_READ); 1042 } 1043 1044 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1045 1046 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1047 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1048 { 1049 BlockDriver *drv = bs->drv; 1050 QEMUIOVector qiov; 1051 struct iovec iov = {0}; 1052 int ret = 0; 1053 1054 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1055 BDRV_REQUEST_MAX_SECTORS); 1056 1057 while (nb_sectors > 0 && !ret) { 1058 int num = nb_sectors; 1059 1060 /* Align request. Block drivers can expect the "bulk" of the request 1061 * to be aligned. 1062 */ 1063 if (bs->bl.write_zeroes_alignment 1064 && num > bs->bl.write_zeroes_alignment) { 1065 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1066 /* Make a small request up to the first aligned sector. */ 1067 num = bs->bl.write_zeroes_alignment; 1068 num -= sector_num % bs->bl.write_zeroes_alignment; 1069 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1070 /* Shorten the request to the last aligned sector. num cannot 1071 * underflow because num > bs->bl.write_zeroes_alignment. 
1072 */ 1073 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 1074 } 1075 } 1076 1077 /* limit request size */ 1078 if (num > max_write_zeroes) { 1079 num = max_write_zeroes; 1080 } 1081 1082 ret = -ENOTSUP; 1083 /* First try the efficient write zeroes operation */ 1084 if (drv->bdrv_co_write_zeroes) { 1085 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 1086 } 1087 1088 if (ret == -ENOTSUP) { 1089 /* Fall back to bounce buffer if write zeroes is unsupported */ 1090 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, 1091 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1092 num = MIN(num, max_xfer_len); 1093 iov.iov_len = num * BDRV_SECTOR_SIZE; 1094 if (iov.iov_base == NULL) { 1095 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); 1096 if (iov.iov_base == NULL) { 1097 ret = -ENOMEM; 1098 goto fail; 1099 } 1100 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 1101 } 1102 qemu_iovec_init_external(&qiov, &iov, 1); 1103 1104 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 1105 1106 /* Keep bounce buffer around if it is big enough for all 1107 * all future requests. 1108 */ 1109 if (num < max_xfer_len) { 1110 qemu_vfree(iov.iov_base); 1111 iov.iov_base = NULL; 1112 } 1113 } 1114 1115 sector_num += num; 1116 nb_sectors -= num; 1117 } 1118 1119 fail: 1120 qemu_vfree(iov.iov_base); 1121 return ret; 1122 } 1123 1124 /* 1125 * Forwards an already correctly aligned write request to the BlockDriver. 1126 */ 1127 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 1128 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1129 QEMUIOVector *qiov, int flags) 1130 { 1131 BlockDriver *drv = bs->drv; 1132 bool waited; 1133 int ret; 1134 1135 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 1136 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 1137 1138 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1139 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1140 assert(!qiov || bytes == qiov->size); 1141 1142 waited = wait_serialising_requests(req); 1143 assert(!waited || !req->serialising); 1144 assert(req->overlap_offset <= offset); 1145 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1146 1147 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1148 1149 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1150 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 1151 qemu_iovec_is_zero(qiov)) { 1152 flags |= BDRV_REQ_ZERO_WRITE; 1153 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1154 flags |= BDRV_REQ_MAY_UNMAP; 1155 } 1156 } 1157 1158 if (ret < 0) { 1159 /* Do nothing, write notifier decided to fail this request */ 1160 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1161 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1162 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 1163 } else { 1164 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1165 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 1166 } 1167 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 1168 1169 if (ret == 0 && !bs->enable_write_cache) { 1170 ret = bdrv_co_flush(bs); 1171 } 1172 1173 bdrv_set_dirty(bs, sector_num, nb_sectors); 1174 1175 if (bs->wr_highest_offset < offset + bytes) { 1176 bs->wr_highest_offset = offset + bytes; 1177 } 1178 1179 if (ret >= 0) { 1180 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1181 } 1182 1183 return ret; 1184 } 1185 1186 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, 1187 int64_t offset, 1188 unsigned int bytes, 
1189 BdrvRequestFlags flags, 1190 BdrvTrackedRequest *req) 1191 { 1192 uint8_t *buf = NULL; 1193 QEMUIOVector local_qiov; 1194 struct iovec iov; 1195 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1196 unsigned int head_padding_bytes, tail_padding_bytes; 1197 int ret = 0; 1198 1199 head_padding_bytes = offset & (align - 1); 1200 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1201 1202 1203 assert(flags & BDRV_REQ_ZERO_WRITE); 1204 if (head_padding_bytes || tail_padding_bytes) { 1205 buf = qemu_blockalign(bs, align); 1206 iov = (struct iovec) { 1207 .iov_base = buf, 1208 .iov_len = align, 1209 }; 1210 qemu_iovec_init_external(&local_qiov, &iov, 1); 1211 } 1212 if (head_padding_bytes) { 1213 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1214 1215 /* RMW the unaligned part before head. */ 1216 mark_request_serialising(req, align); 1217 wait_serialising_requests(req); 1218 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1219 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1220 align, &local_qiov, 0); 1221 if (ret < 0) { 1222 goto fail; 1223 } 1224 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1225 1226 memset(buf + head_padding_bytes, 0, zero_bytes); 1227 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1228 &local_qiov, 1229 flags & ~BDRV_REQ_ZERO_WRITE); 1230 if (ret < 0) { 1231 goto fail; 1232 } 1233 offset += zero_bytes; 1234 bytes -= zero_bytes; 1235 } 1236 1237 assert(!bytes || (offset & (align - 1)) == 0); 1238 if (bytes >= align) { 1239 /* Write the aligned part in the middle. */ 1240 uint64_t aligned_bytes = bytes & ~(align - 1); 1241 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1242 NULL, flags); 1243 if (ret < 0) { 1244 goto fail; 1245 } 1246 bytes -= aligned_bytes; 1247 offset += aligned_bytes; 1248 } 1249 1250 assert(!bytes || (offset & (align - 1)) == 0); 1251 if (bytes) { 1252 assert(align == tail_padding_bytes + bytes); 1253 /* RMW the unaligned part after tail. */ 1254 mark_request_serialising(req, align); 1255 wait_serialising_requests(req); 1256 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1257 ret = bdrv_aligned_preadv(bs, req, offset, align, 1258 align, &local_qiov, 0); 1259 if (ret < 0) { 1260 goto fail; 1261 } 1262 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1263 1264 memset(buf, 0, bytes); 1265 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1266 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1267 } 1268 fail: 1269 qemu_vfree(buf); 1270 return ret; 1271 1272 } 1273 1274 /* 1275 * Handle a write request in coroutine context 1276 */ 1277 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1278 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1279 BdrvRequestFlags flags) 1280 { 1281 BdrvTrackedRequest req; 1282 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1283 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1284 uint8_t *head_buf = NULL; 1285 uint8_t *tail_buf = NULL; 1286 QEMUIOVector local_qiov; 1287 bool use_local_qiov = false; 1288 int ret; 1289 1290 if (!bs->drv) { 1291 return -ENOMEDIUM; 1292 } 1293 if (bs->read_only) { 1294 return -EPERM; 1295 } 1296 1297 ret = bdrv_check_byte_request(bs, offset, bytes); 1298 if (ret < 0) { 1299 return ret; 1300 } 1301 1302 /* throttling disk I/O */ 1303 if (bs->io_limits_enabled) { 1304 throttle_group_co_io_limits_intercept(bs, bytes, true); 1305 } 1306 1307 /* 1308 * Align write if necessary by performing a read-modify-write cycle. 
1309 * Pad qiov with the read parts and be sure to have a tracked request not 1310 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1311 */ 1312 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1313 1314 if (!qiov) { 1315 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1316 goto out; 1317 } 1318 1319 if (offset & (align - 1)) { 1320 QEMUIOVector head_qiov; 1321 struct iovec head_iov; 1322 1323 mark_request_serialising(&req, align); 1324 wait_serialising_requests(&req); 1325 1326 head_buf = qemu_blockalign(bs, align); 1327 head_iov = (struct iovec) { 1328 .iov_base = head_buf, 1329 .iov_len = align, 1330 }; 1331 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1332 1333 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1334 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1335 align, &head_qiov, 0); 1336 if (ret < 0) { 1337 goto fail; 1338 } 1339 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1340 1341 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1342 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1343 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1344 use_local_qiov = true; 1345 1346 bytes += offset & (align - 1); 1347 offset = offset & ~(align - 1); 1348 } 1349 1350 if ((offset + bytes) & (align - 1)) { 1351 QEMUIOVector tail_qiov; 1352 struct iovec tail_iov; 1353 size_t tail_bytes; 1354 bool waited; 1355 1356 mark_request_serialising(&req, align); 1357 waited = wait_serialising_requests(&req); 1358 assert(!waited || !use_local_qiov); 1359 1360 tail_buf = qemu_blockalign(bs, align); 1361 tail_iov = (struct iovec) { 1362 .iov_base = tail_buf, 1363 .iov_len = align, 1364 }; 1365 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1366 1367 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1368 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1369 align, &tail_qiov, 0); 1370 if (ret < 0) { 1371 goto fail; 1372 } 1373 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1374 1375 if (!use_local_qiov) { 1376 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1377 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1378 use_local_qiov = true; 1379 } 1380 1381 tail_bytes = (offset + bytes) & (align - 1); 1382 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1383 1384 bytes = ROUND_UP(bytes, align); 1385 } 1386 1387 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1388 use_local_qiov ? 
&local_qiov : qiov, 1389 flags); 1390 1391 fail: 1392 1393 if (use_local_qiov) { 1394 qemu_iovec_destroy(&local_qiov); 1395 } 1396 qemu_vfree(head_buf); 1397 qemu_vfree(tail_buf); 1398 out: 1399 tracked_request_end(&req); 1400 return ret; 1401 } 1402 1403 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1404 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1405 BdrvRequestFlags flags) 1406 { 1407 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1408 return -EINVAL; 1409 } 1410 1411 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1412 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1413 } 1414 1415 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1416 int nb_sectors, QEMUIOVector *qiov) 1417 { 1418 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1419 1420 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1421 } 1422 1423 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1424 int64_t sector_num, int nb_sectors, 1425 BdrvRequestFlags flags) 1426 { 1427 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1428 1429 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1430 flags &= ~BDRV_REQ_MAY_UNMAP; 1431 } 1432 1433 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1434 BDRV_REQ_ZERO_WRITE | flags); 1435 } 1436 1437 int bdrv_flush_all(void) 1438 { 1439 BlockDriverState *bs = NULL; 1440 int result = 0; 1441 1442 while ((bs = bdrv_next(bs))) { 1443 AioContext *aio_context = bdrv_get_aio_context(bs); 1444 int ret; 1445 1446 aio_context_acquire(aio_context); 1447 ret = bdrv_flush(bs); 1448 if (ret < 0 && !result) { 1449 result = ret; 1450 } 1451 aio_context_release(aio_context); 1452 } 1453 1454 return result; 1455 } 1456 1457 typedef struct BdrvCoGetBlockStatusData { 1458 BlockDriverState *bs; 1459 BlockDriverState *base; 1460 int64_t sector_num; 1461 int nb_sectors; 1462 int *pnum; 1463 int64_t ret; 1464 bool done; 1465 } BdrvCoGetBlockStatusData; 1466 1467 /* 1468 * Returns the allocation status of the specified sectors. 1469 * Drivers not implementing the functionality are assumed to not support 1470 * backing files, hence all their sectors are reported as allocated. 1471 * 1472 * If 'sector_num' is beyond the end of the disk image the return value is 0 1473 * and 'pnum' is set to 0. 1474 * 1475 * 'pnum' is set to the number of sectors (including and immediately following 1476 * the specified sector) that are known to be in the same 1477 * allocated/unallocated state. 1478 * 1479 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1480 * beyond the end of the disk image it will be clamped. 
1481 */ 1482 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1483 int64_t sector_num, 1484 int nb_sectors, int *pnum) 1485 { 1486 int64_t total_sectors; 1487 int64_t n; 1488 int64_t ret, ret2; 1489 1490 total_sectors = bdrv_nb_sectors(bs); 1491 if (total_sectors < 0) { 1492 return total_sectors; 1493 } 1494 1495 if (sector_num >= total_sectors) { 1496 *pnum = 0; 1497 return 0; 1498 } 1499 1500 n = total_sectors - sector_num; 1501 if (n < nb_sectors) { 1502 nb_sectors = n; 1503 } 1504 1505 if (!bs->drv->bdrv_co_get_block_status) { 1506 *pnum = nb_sectors; 1507 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1508 if (bs->drv->protocol_name) { 1509 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1510 } 1511 return ret; 1512 } 1513 1514 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 1515 if (ret < 0) { 1516 *pnum = 0; 1517 return ret; 1518 } 1519 1520 if (ret & BDRV_BLOCK_RAW) { 1521 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1522 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1523 *pnum, pnum); 1524 } 1525 1526 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1527 ret |= BDRV_BLOCK_ALLOCATED; 1528 } else { 1529 if (bdrv_unallocated_blocks_are_zero(bs)) { 1530 ret |= BDRV_BLOCK_ZERO; 1531 } else if (bs->backing) { 1532 BlockDriverState *bs2 = bs->backing->bs; 1533 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1534 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1535 ret |= BDRV_BLOCK_ZERO; 1536 } 1537 } 1538 } 1539 1540 if (bs->file && 1541 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1542 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1543 int file_pnum; 1544 1545 ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1546 *pnum, &file_pnum); 1547 if (ret2 >= 0) { 1548 /* Ignore errors. This is just providing extra information, it 1549 * is useful but not necessary. 1550 */ 1551 if (!file_pnum) { 1552 /* !file_pnum indicates an offset at or beyond the EOF; it is 1553 * perfectly valid for the format block driver to point to such 1554 * offsets, so catch it and mark everything as zero */ 1555 ret |= BDRV_BLOCK_ZERO; 1556 } else { 1557 /* Limit request to the range reported by the protocol driver */ 1558 *pnum = file_pnum; 1559 ret |= (ret2 & BDRV_BLOCK_ZERO); 1560 } 1561 } 1562 } 1563 1564 return ret; 1565 } 1566 1567 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1568 BlockDriverState *base, 1569 int64_t sector_num, 1570 int nb_sectors, 1571 int *pnum) 1572 { 1573 BlockDriverState *p; 1574 int64_t ret = 0; 1575 1576 assert(bs != base); 1577 for (p = bs; p != base; p = backing_bs(p)) { 1578 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); 1579 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1580 break; 1581 } 1582 /* [sector_num, pnum] unallocated on this layer, which could be only 1583 * the first part of [sector_num, nb_sectors]. */ 1584 nb_sectors = MIN(nb_sectors, *pnum); 1585 } 1586 return ret; 1587 } 1588 1589 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1590 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1591 { 1592 BdrvCoGetBlockStatusData *data = opaque; 1593 1594 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1595 data->sector_num, 1596 data->nb_sectors, 1597 data->pnum); 1598 data->done = true; 1599 } 1600 1601 /* 1602 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1603 * 1604 * See bdrv_co_get_block_status_above() for details. 
1605 */ 1606 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1607 BlockDriverState *base, 1608 int64_t sector_num, 1609 int nb_sectors, int *pnum) 1610 { 1611 Coroutine *co; 1612 BdrvCoGetBlockStatusData data = { 1613 .bs = bs, 1614 .base = base, 1615 .sector_num = sector_num, 1616 .nb_sectors = nb_sectors, 1617 .pnum = pnum, 1618 .done = false, 1619 }; 1620 1621 if (qemu_in_coroutine()) { 1622 /* Fast-path if already in coroutine context */ 1623 bdrv_get_block_status_above_co_entry(&data); 1624 } else { 1625 AioContext *aio_context = bdrv_get_aio_context(bs); 1626 1627 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1628 qemu_coroutine_enter(co, &data); 1629 while (!data.done) { 1630 aio_poll(aio_context, true); 1631 } 1632 } 1633 return data.ret; 1634 } 1635 1636 int64_t bdrv_get_block_status(BlockDriverState *bs, 1637 int64_t sector_num, 1638 int nb_sectors, int *pnum) 1639 { 1640 return bdrv_get_block_status_above(bs, backing_bs(bs), 1641 sector_num, nb_sectors, pnum); 1642 } 1643 1644 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1645 int nb_sectors, int *pnum) 1646 { 1647 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 1648 if (ret < 0) { 1649 return ret; 1650 } 1651 return !!(ret & BDRV_BLOCK_ALLOCATED); 1652 } 1653 1654 /* 1655 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1656 * 1657 * Return true if the given sector is allocated in any image between 1658 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1659 * sector is allocated in any image of the chain. Return false otherwise. 1660 * 1661 * 'pnum' is set to the number of sectors (including and immediately following 1662 * the specified sector) that are known to be in the same 1663 * allocated/unallocated state. 1664 * 1665 */ 1666 int bdrv_is_allocated_above(BlockDriverState *top, 1667 BlockDriverState *base, 1668 int64_t sector_num, 1669 int nb_sectors, int *pnum) 1670 { 1671 BlockDriverState *intermediate; 1672 int ret, n = nb_sectors; 1673 1674 intermediate = top; 1675 while (intermediate && intermediate != base) { 1676 int pnum_inter; 1677 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1678 &pnum_inter); 1679 if (ret < 0) { 1680 return ret; 1681 } else if (ret) { 1682 *pnum = pnum_inter; 1683 return 1; 1684 } 1685 1686 /* 1687 * [sector_num, nb_sectors] is unallocated on top but intermediate 1688 * might have 1689 * 1690 * [sector_num+x, nr_sectors] allocated. 
1691 */ 1692 if (n > pnum_inter && 1693 (intermediate == top || 1694 sector_num + pnum_inter < intermediate->total_sectors)) { 1695 n = pnum_inter; 1696 } 1697 1698 intermediate = backing_bs(intermediate); 1699 } 1700 1701 *pnum = n; 1702 return 0; 1703 } 1704 1705 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1706 const uint8_t *buf, int nb_sectors) 1707 { 1708 BlockDriver *drv = bs->drv; 1709 int ret; 1710 1711 if (!drv) { 1712 return -ENOMEDIUM; 1713 } 1714 if (!drv->bdrv_write_compressed) { 1715 return -ENOTSUP; 1716 } 1717 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1718 if (ret < 0) { 1719 return ret; 1720 } 1721 1722 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1723 1724 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1725 } 1726 1727 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1728 int64_t pos, int size) 1729 { 1730 QEMUIOVector qiov; 1731 struct iovec iov = { 1732 .iov_base = (void *) buf, 1733 .iov_len = size, 1734 }; 1735 1736 qemu_iovec_init_external(&qiov, &iov, 1); 1737 return bdrv_writev_vmstate(bs, &qiov, pos); 1738 } 1739 1740 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1741 { 1742 BlockDriver *drv = bs->drv; 1743 1744 if (!drv) { 1745 return -ENOMEDIUM; 1746 } else if (drv->bdrv_save_vmstate) { 1747 return drv->bdrv_save_vmstate(bs, qiov, pos); 1748 } else if (bs->file) { 1749 return bdrv_writev_vmstate(bs->file->bs, qiov, pos); 1750 } 1751 1752 return -ENOTSUP; 1753 } 1754 1755 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1756 int64_t pos, int size) 1757 { 1758 BlockDriver *drv = bs->drv; 1759 if (!drv) 1760 return -ENOMEDIUM; 1761 if (drv->bdrv_load_vmstate) 1762 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1763 if (bs->file) 1764 return bdrv_load_vmstate(bs->file->bs, buf, pos, size); 1765 return -ENOTSUP; 1766 } 1767 1768 /**************************************************************/ 1769 /* async I/Os */ 1770 1771 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1772 QEMUIOVector *qiov, int nb_sectors, 1773 BlockCompletionFunc *cb, void *opaque) 1774 { 1775 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1776 1777 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1778 cb, opaque, false); 1779 } 1780 1781 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1782 QEMUIOVector *qiov, int nb_sectors, 1783 BlockCompletionFunc *cb, void *opaque) 1784 { 1785 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1786 1787 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1788 cb, opaque, true); 1789 } 1790 1791 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1792 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1793 BlockCompletionFunc *cb, void *opaque) 1794 { 1795 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1796 1797 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1798 BDRV_REQ_ZERO_WRITE | flags, 1799 cb, opaque, true); 1800 } 1801 1802 1803 typedef struct MultiwriteCB { 1804 int error; 1805 int num_requests; 1806 int num_callbacks; 1807 struct { 1808 BlockCompletionFunc *cb; 1809 void *opaque; 1810 QEMUIOVector *free_qiov; 1811 } callbacks[]; 1812 } MultiwriteCB; 1813 1814 static void multiwrite_user_cb(MultiwriteCB *mcb) 1815 { 1816 int i; 1817 1818 for (i = 0; i < mcb->num_callbacks; i++) { 1819 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1820 if (mcb->callbacks[i].free_qiov) { 1821 
qemu_iovec_destroy(mcb->callbacks[i].free_qiov); 1822 } 1823 g_free(mcb->callbacks[i].free_qiov); 1824 } 1825 } 1826 1827 static void multiwrite_cb(void *opaque, int ret) 1828 { 1829 MultiwriteCB *mcb = opaque; 1830 1831 trace_multiwrite_cb(mcb, ret); 1832 1833 if (ret < 0 && !mcb->error) { 1834 mcb->error = ret; 1835 } 1836 1837 mcb->num_requests--; 1838 if (mcb->num_requests == 0) { 1839 multiwrite_user_cb(mcb); 1840 g_free(mcb); 1841 } 1842 } 1843 1844 static int multiwrite_req_compare(const void *a, const void *b) 1845 { 1846 const BlockRequest *req1 = a, *req2 = b; 1847 1848 /* 1849 * Note that we can't simply subtract req2->sector from req1->sector 1850 * here as that could overflow the return value. 1851 */ 1852 if (req1->sector > req2->sector) { 1853 return 1; 1854 } else if (req1->sector < req2->sector) { 1855 return -1; 1856 } else { 1857 return 0; 1858 } 1859 } 1860 1861 /* 1862 * Takes a bunch of requests and tries to merge them. Returns the number of 1863 * requests that remain after merging. 1864 */ 1865 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, 1866 int num_reqs, MultiwriteCB *mcb) 1867 { 1868 int i, outidx; 1869 1870 // Sort requests by start sector 1871 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); 1872 1873 // Check if adjacent requests touch the same clusters. If so, combine them, 1874 // filling up gaps with zero sectors. 1875 outidx = 0; 1876 for (i = 1; i < num_reqs; i++) { 1877 int merge = 0; 1878 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; 1879 1880 // Handle exactly sequential writes and overlapping writes. 1881 if (reqs[i].sector <= oldreq_last) { 1882 merge = 1; 1883 } 1884 1885 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { 1886 merge = 0; 1887 } 1888 1889 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + 1890 reqs[i].nb_sectors > bs->bl.max_transfer_length) { 1891 merge = 0; 1892 } 1893 1894 if (merge) { 1895 size_t size; 1896 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); 1897 qemu_iovec_init(qiov, 1898 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); 1899 1900 // Add the first request to the merged one. If the requests are 1901 // overlapping, drop the last sectors of the first request. 1902 size = (reqs[i].sector - reqs[outidx].sector) << 9; 1903 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); 1904 1905 // We should need to add any zeros between the two requests 1906 assert (reqs[i].sector <= oldreq_last); 1907 1908 // Add the second request 1909 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); 1910 1911 // Add tail of first request, if necessary 1912 if (qiov->size < reqs[outidx].qiov->size) { 1913 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, 1914 reqs[outidx].qiov->size - qiov->size); 1915 } 1916 1917 reqs[outidx].nb_sectors = qiov->size >> 9; 1918 reqs[outidx].qiov = qiov; 1919 1920 mcb->callbacks[i].free_qiov = reqs[outidx].qiov; 1921 } else { 1922 outidx++; 1923 reqs[outidx].sector = reqs[i].sector; 1924 reqs[outidx].nb_sectors = reqs[i].nb_sectors; 1925 reqs[outidx].qiov = reqs[i].qiov; 1926 } 1927 } 1928 1929 if (bs->blk) { 1930 block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, 1931 num_reqs - outidx - 1); 1932 } 1933 1934 return outidx + 1; 1935 } 1936 1937 /* 1938 * Submit multiple AIO write requests at once. 1939 * 1940 * On success, the function returns 0 and all requests in the reqs array have 1941 * been submitted. 
In error case this function returns -1, and any of the 1942 * requests may or may not be submitted yet. In particular, this means that the 1943 * callback will be called for some of the requests, for others it won't. The 1944 * caller must check the error field of the BlockRequest to wait for the right 1945 * callbacks (if error != 0, no callback will be called). 1946 * 1947 * The implementation may modify the contents of the reqs array, e.g. to merge 1948 * requests. However, the fields opaque and error are left unmodified as they 1949 * are used to signal failure for a single request to the caller. 1950 */ 1951 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 1952 { 1953 MultiwriteCB *mcb; 1954 int i; 1955 1956 /* don't submit writes if we don't have a medium */ 1957 if (bs->drv == NULL) { 1958 for (i = 0; i < num_reqs; i++) { 1959 reqs[i].error = -ENOMEDIUM; 1960 } 1961 return -1; 1962 } 1963 1964 if (num_reqs == 0) { 1965 return 0; 1966 } 1967 1968 // Create MultiwriteCB structure 1969 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 1970 mcb->num_requests = 0; 1971 mcb->num_callbacks = num_reqs; 1972 1973 for (i = 0; i < num_reqs; i++) { 1974 mcb->callbacks[i].cb = reqs[i].cb; 1975 mcb->callbacks[i].opaque = reqs[i].opaque; 1976 } 1977 1978 // Check for mergable requests 1979 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 1980 1981 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 1982 1983 /* Run the aio requests. */ 1984 mcb->num_requests = num_reqs; 1985 for (i = 0; i < num_reqs; i++) { 1986 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 1987 reqs[i].nb_sectors, reqs[i].flags, 1988 multiwrite_cb, mcb, 1989 true); 1990 } 1991 1992 return 0; 1993 } 1994 1995 void bdrv_aio_cancel(BlockAIOCB *acb) 1996 { 1997 qemu_aio_ref(acb); 1998 bdrv_aio_cancel_async(acb); 1999 while (acb->refcnt > 1) { 2000 if (acb->aiocb_info->get_aio_context) { 2001 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2002 } else if (acb->bs) { 2003 aio_poll(bdrv_get_aio_context(acb->bs), true); 2004 } else { 2005 abort(); 2006 } 2007 } 2008 qemu_aio_unref(acb); 2009 } 2010 2011 /* Async version of aio cancel. The caller is not blocked if the acb implements 2012 * cancel_async, otherwise we do nothing and let the request normally complete. 2013 * In either case the completion callback must be called. 
*/ 2014 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2015 { 2016 if (acb->aiocb_info->cancel_async) { 2017 acb->aiocb_info->cancel_async(acb); 2018 } 2019 } 2020 2021 /**************************************************************/ 2022 /* async block device emulation */ 2023 2024 typedef struct BlockAIOCBSync { 2025 BlockAIOCB common; 2026 QEMUBH *bh; 2027 int ret; 2028 /* vector translation state */ 2029 QEMUIOVector *qiov; 2030 uint8_t *bounce; 2031 int is_write; 2032 } BlockAIOCBSync; 2033 2034 static const AIOCBInfo bdrv_em_aiocb_info = { 2035 .aiocb_size = sizeof(BlockAIOCBSync), 2036 }; 2037 2038 static void bdrv_aio_bh_cb(void *opaque) 2039 { 2040 BlockAIOCBSync *acb = opaque; 2041 2042 if (!acb->is_write && acb->ret >= 0) { 2043 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 2044 } 2045 qemu_vfree(acb->bounce); 2046 acb->common.cb(acb->common.opaque, acb->ret); 2047 qemu_bh_delete(acb->bh); 2048 acb->bh = NULL; 2049 qemu_aio_unref(acb); 2050 } 2051 2052 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 2053 int64_t sector_num, 2054 QEMUIOVector *qiov, 2055 int nb_sectors, 2056 BlockCompletionFunc *cb, 2057 void *opaque, 2058 int is_write) 2059 2060 { 2061 BlockAIOCBSync *acb; 2062 2063 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 2064 acb->is_write = is_write; 2065 acb->qiov = qiov; 2066 acb->bounce = qemu_try_blockalign(bs, qiov->size); 2067 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); 2068 2069 if (acb->bounce == NULL) { 2070 acb->ret = -ENOMEM; 2071 } else if (is_write) { 2072 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 2073 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 2074 } else { 2075 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 2076 } 2077 2078 qemu_bh_schedule(acb->bh); 2079 2080 return &acb->common; 2081 } 2082 2083 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 2084 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2085 BlockCompletionFunc *cb, void *opaque) 2086 { 2087 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 2088 } 2089 2090 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 2091 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2092 BlockCompletionFunc *cb, void *opaque) 2093 { 2094 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 2095 } 2096 2097 2098 typedef struct BlockAIOCBCoroutine { 2099 BlockAIOCB common; 2100 BlockRequest req; 2101 bool is_write; 2102 bool need_bh; 2103 bool *done; 2104 QEMUBH* bh; 2105 } BlockAIOCBCoroutine; 2106 2107 static const AIOCBInfo bdrv_em_co_aiocb_info = { 2108 .aiocb_size = sizeof(BlockAIOCBCoroutine), 2109 }; 2110 2111 static void bdrv_co_complete(BlockAIOCBCoroutine *acb) 2112 { 2113 if (!acb->need_bh) { 2114 acb->common.cb(acb->common.opaque, acb->req.error); 2115 qemu_aio_unref(acb); 2116 } 2117 } 2118 2119 static void bdrv_co_em_bh(void *opaque) 2120 { 2121 BlockAIOCBCoroutine *acb = opaque; 2122 2123 assert(!acb->need_bh); 2124 qemu_bh_delete(acb->bh); 2125 bdrv_co_complete(acb); 2126 } 2127 2128 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) 2129 { 2130 acb->need_bh = false; 2131 if (acb->req.error != -EINPROGRESS) { 2132 BlockDriverState *bs = acb->common.bs; 2133 2134 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 2135 qemu_bh_schedule(acb->bh); 2136 } 2137 } 2138 2139 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 2140 static void coroutine_fn 
bdrv_co_do_rw(void *opaque) 2141 { 2142 BlockAIOCBCoroutine *acb = opaque; 2143 BlockDriverState *bs = acb->common.bs; 2144 2145 if (!acb->is_write) { 2146 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 2147 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2148 } else { 2149 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 2150 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2151 } 2152 2153 bdrv_co_complete(acb); 2154 } 2155 2156 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 2157 int64_t sector_num, 2158 QEMUIOVector *qiov, 2159 int nb_sectors, 2160 BdrvRequestFlags flags, 2161 BlockCompletionFunc *cb, 2162 void *opaque, 2163 bool is_write) 2164 { 2165 Coroutine *co; 2166 BlockAIOCBCoroutine *acb; 2167 2168 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2169 acb->need_bh = true; 2170 acb->req.error = -EINPROGRESS; 2171 acb->req.sector = sector_num; 2172 acb->req.nb_sectors = nb_sectors; 2173 acb->req.qiov = qiov; 2174 acb->req.flags = flags; 2175 acb->is_write = is_write; 2176 2177 co = qemu_coroutine_create(bdrv_co_do_rw); 2178 qemu_coroutine_enter(co, acb); 2179 2180 bdrv_co_maybe_schedule_bh(acb); 2181 return &acb->common; 2182 } 2183 2184 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 2185 { 2186 BlockAIOCBCoroutine *acb = opaque; 2187 BlockDriverState *bs = acb->common.bs; 2188 2189 acb->req.error = bdrv_co_flush(bs); 2190 bdrv_co_complete(acb); 2191 } 2192 2193 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, 2194 BlockCompletionFunc *cb, void *opaque) 2195 { 2196 trace_bdrv_aio_flush(bs, opaque); 2197 2198 Coroutine *co; 2199 BlockAIOCBCoroutine *acb; 2200 2201 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2202 acb->need_bh = true; 2203 acb->req.error = -EINPROGRESS; 2204 2205 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 2206 qemu_coroutine_enter(co, acb); 2207 2208 bdrv_co_maybe_schedule_bh(acb); 2209 return &acb->common; 2210 } 2211 2212 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 2213 { 2214 BlockAIOCBCoroutine *acb = opaque; 2215 BlockDriverState *bs = acb->common.bs; 2216 2217 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 2218 bdrv_co_complete(acb); 2219 } 2220 2221 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, 2222 int64_t sector_num, int nb_sectors, 2223 BlockCompletionFunc *cb, void *opaque) 2224 { 2225 Coroutine *co; 2226 BlockAIOCBCoroutine *acb; 2227 2228 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 2229 2230 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2231 acb->need_bh = true; 2232 acb->req.error = -EINPROGRESS; 2233 acb->req.sector = sector_num; 2234 acb->req.nb_sectors = nb_sectors; 2235 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 2236 qemu_coroutine_enter(co, acb); 2237 2238 bdrv_co_maybe_schedule_bh(acb); 2239 return &acb->common; 2240 } 2241 2242 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 2243 BlockCompletionFunc *cb, void *opaque) 2244 { 2245 BlockAIOCB *acb; 2246 2247 acb = g_malloc(aiocb_info->aiocb_size); 2248 acb->aiocb_info = aiocb_info; 2249 acb->bs = bs; 2250 acb->cb = cb; 2251 acb->opaque = opaque; 2252 acb->refcnt = 1; 2253 return acb; 2254 } 2255 2256 void qemu_aio_ref(void *p) 2257 { 2258 BlockAIOCB *acb = p; 2259 acb->refcnt++; 2260 } 2261 2262 void qemu_aio_unref(void *p) 2263 { 2264 BlockAIOCB *acb = p; 2265 assert(acb->refcnt > 0); 2266 if (--acb->refcnt == 0) { 2267 g_free(acb); 2268 } 2269 } 2270 2271 
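/*
 * Illustrative sketch (editorial addition, kept compiled out): how a caller
 * might drive one of the emulated AIO entry points above and wait for its
 * completion by polling the node's AioContext, in the same way bdrv_flush()
 * and bdrv_aio_cancel() do.  The helper names example_flush_cb() and
 * example_flush_and_wait() are hypothetical; real callers normally go
 * through bdrv_flush() or the BlockBackend wrappers.
 */
#if 0
static void example_flush_cb(void *opaque, int ret)
{
    /* The completion callback receives the caller's opaque pointer and the
     * request's result: 0 on success or a negative errno value. */
    int *flush_ret = opaque;
    *flush_ret = ret;
}

static int example_flush_and_wait(BlockDriverState *bs)
{
    int flush_ret = -EINPROGRESS;   /* sentinel: callback has not run yet */

    /* bdrv_aio_flush() always returns an acb; completion is reported only
     * through the callback, possibly from a bottom half scheduled by
     * bdrv_co_maybe_schedule_bh().  The returned BlockAIOCB could also be
     * handed to bdrv_aio_cancel(). */
    bdrv_aio_flush(bs, example_flush_cb, &flush_ret);

    /* Poll the AioContext until the callback has stored a result. */
    while (flush_ret == -EINPROGRESS) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }
    return flush_ret;
}
#endif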
/**************************************************************/ 2272 /* Coroutine block device emulation */ 2273 2274 typedef struct CoroutineIOCompletion { 2275 Coroutine *coroutine; 2276 int ret; 2277 } CoroutineIOCompletion; 2278 2279 static void bdrv_co_io_em_complete(void *opaque, int ret) 2280 { 2281 CoroutineIOCompletion *co = opaque; 2282 2283 co->ret = ret; 2284 qemu_coroutine_enter(co->coroutine, NULL); 2285 } 2286 2287 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 2288 int nb_sectors, QEMUIOVector *iov, 2289 bool is_write) 2290 { 2291 CoroutineIOCompletion co = { 2292 .coroutine = qemu_coroutine_self(), 2293 }; 2294 BlockAIOCB *acb; 2295 2296 if (is_write) { 2297 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 2298 bdrv_co_io_em_complete, &co); 2299 } else { 2300 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 2301 bdrv_co_io_em_complete, &co); 2302 } 2303 2304 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 2305 if (!acb) { 2306 return -EIO; 2307 } 2308 qemu_coroutine_yield(); 2309 2310 return co.ret; 2311 } 2312 2313 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 2314 int64_t sector_num, int nb_sectors, 2315 QEMUIOVector *iov) 2316 { 2317 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 2318 } 2319 2320 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 2321 int64_t sector_num, int nb_sectors, 2322 QEMUIOVector *iov) 2323 { 2324 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 2325 } 2326 2327 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 2328 { 2329 RwCo *rwco = opaque; 2330 2331 rwco->ret = bdrv_co_flush(rwco->bs); 2332 } 2333 2334 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2335 { 2336 int ret; 2337 BdrvTrackedRequest req; 2338 2339 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2340 bdrv_is_sg(bs)) { 2341 return 0; 2342 } 2343 2344 tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); 2345 /* Write back cached data to the OS even with cache=unsafe */ 2346 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2347 if (bs->drv->bdrv_co_flush_to_os) { 2348 ret = bs->drv->bdrv_co_flush_to_os(bs); 2349 if (ret < 0) { 2350 goto out; 2351 } 2352 } 2353 2354 /* But don't actually force it to the disk with cache=unsafe */ 2355 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2356 goto flush_parent; 2357 } 2358 2359 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2360 if (bs->drv->bdrv_co_flush_to_disk) { 2361 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2362 } else if (bs->drv->bdrv_aio_flush) { 2363 BlockAIOCB *acb; 2364 CoroutineIOCompletion co = { 2365 .coroutine = qemu_coroutine_self(), 2366 }; 2367 2368 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2369 if (acb == NULL) { 2370 ret = -EIO; 2371 } else { 2372 qemu_coroutine_yield(); 2373 ret = co.ret; 2374 } 2375 } else { 2376 /* 2377 * Some block drivers always operate in either writethrough or unsafe 2378 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2379 * know how the server works (because the behaviour is hardcoded or 2380 * depends on server-side configuration), so we can't ensure that 2381 * everything is safe on disk. Returning an error doesn't work because 2382 * that would break guests even if the server operates in writethrough 2383 * mode. 2384 * 2385 * Let's hope the user knows what he's doing. 2386 */ 2387 ret = 0; 2388 } 2389 if (ret < 0) { 2390 goto out; 2391 } 2392 2393 /* Now flush the underlying protocol. 
It will also have BDRV_O_NO_FLUSH
2394 * in the case of cache=unsafe, so there are no useless flushes.
2395 */
2396 flush_parent:
2397 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2398 out:
2399 tracked_request_end(&req);
2400 return ret;
2401 }
2402
2403 int bdrv_flush(BlockDriverState *bs)
2404 {
2405 Coroutine *co;
2406 RwCo rwco = {
2407 .bs = bs,
2408 .ret = NOT_DONE,
2409 };
2410
2411 if (qemu_in_coroutine()) {
2412 /* Fast-path if already in coroutine context */
2413 bdrv_flush_co_entry(&rwco);
2414 } else {
2415 AioContext *aio_context = bdrv_get_aio_context(bs);
2416
2417 co = qemu_coroutine_create(bdrv_flush_co_entry);
2418 qemu_coroutine_enter(co, &rwco);
2419 while (rwco.ret == NOT_DONE) {
2420 aio_poll(aio_context, true);
2421 }
2422 }
2423
2424 return rwco.ret;
2425 }
2426
2427 typedef struct DiscardCo {
2428 BlockDriverState *bs;
2429 int64_t sector_num;
2430 int nb_sectors;
2431 int ret;
2432 } DiscardCo;
2433 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2434 {
2435 DiscardCo *rwco = opaque;
2436
2437 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2438 }
2439
2440 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2441 int nb_sectors)
2442 {
2443 BdrvTrackedRequest req;
2444 int max_discard, ret;
2445
2446 if (!bs->drv) {
2447 return -ENOMEDIUM;
2448 }
2449
2450 ret = bdrv_check_request(bs, sector_num, nb_sectors);
2451 if (ret < 0) {
2452 return ret;
2453 } else if (bs->read_only) {
2454 return -EPERM;
2455 }
2456
2457 /* Do nothing if disabled. */
2458 if (!(bs->open_flags & BDRV_O_UNMAP)) {
2459 return 0;
2460 }
2461
2462 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2463 return 0;
2464 }
2465
2466 tracked_request_begin(&req, bs, sector_num, nb_sectors,
2467 BDRV_TRACKED_DISCARD);
2468 bdrv_set_dirty(bs, sector_num, nb_sectors);
2469
2470 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2471 while (nb_sectors > 0) {
2472 int ret;
2473 int num = nb_sectors;
2474
2475 /* align request */
2476 if (bs->bl.discard_alignment &&
2477 num >= bs->bl.discard_alignment &&
2478 sector_num % bs->bl.discard_alignment) {
2479 if (num > bs->bl.discard_alignment) {
2480 num = bs->bl.discard_alignment;
2481 }
2482 num -= sector_num % bs->bl.discard_alignment;
2483 }
2484
2485 /* limit request size */
2486 if (num > max_discard) {
2487 num = max_discard;
2488 }
2489
2490 if (bs->drv->bdrv_co_discard) {
2491 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2492 } else {
2493 BlockAIOCB *acb;
2494 CoroutineIOCompletion co = {
2495 .coroutine = qemu_coroutine_self(),
2496 };
2497
2498 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
2499 bdrv_co_io_em_complete, &co);
2500 if (acb == NULL) {
2501 ret = -EIO;
2502 goto out;
2503 } else {
2504 qemu_coroutine_yield();
2505 ret = co.ret;
2506 }
2507 }
2508 if (ret && ret != -ENOTSUP) {
2509 goto out;
2510 }
2511
2512 sector_num += num;
2513 nb_sectors -= num;
2514 }
2515 ret = 0;
2516 out:
2517 tracked_request_end(&req);
2518 return ret;
2519 }
2520
2521 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2522 {
2523 Coroutine *co;
2524 DiscardCo rwco = {
2525 .bs = bs,
2526 .sector_num = sector_num,
2527 .nb_sectors = nb_sectors,
2528 .ret = NOT_DONE,
2529 };
2530
2531 if (qemu_in_coroutine()) {
2532 /* Fast-path if already in coroutine context */
2533 bdrv_discard_co_entry(&rwco);
2534 } else {
2535 AioContext *aio_context = bdrv_get_aio_context(bs);
2536
2537 co =
qemu_coroutine_create(bdrv_discard_co_entry); 2538 qemu_coroutine_enter(co, &rwco); 2539 while (rwco.ret == NOT_DONE) { 2540 aio_poll(aio_context, true); 2541 } 2542 } 2543 2544 return rwco.ret; 2545 } 2546 2547 typedef struct { 2548 CoroutineIOCompletion *co; 2549 QEMUBH *bh; 2550 } BdrvIoctlCompletionData; 2551 2552 static void bdrv_ioctl_bh_cb(void *opaque) 2553 { 2554 BdrvIoctlCompletionData *data = opaque; 2555 2556 bdrv_co_io_em_complete(data->co, -ENOTSUP); 2557 qemu_bh_delete(data->bh); 2558 } 2559 2560 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) 2561 { 2562 BlockDriver *drv = bs->drv; 2563 BdrvTrackedRequest tracked_req; 2564 CoroutineIOCompletion co = { 2565 .coroutine = qemu_coroutine_self(), 2566 }; 2567 BlockAIOCB *acb; 2568 2569 tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); 2570 if (!drv || !drv->bdrv_aio_ioctl) { 2571 co.ret = -ENOTSUP; 2572 goto out; 2573 } 2574 2575 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 2576 if (!acb) { 2577 BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); 2578 data->bh = aio_bh_new(bdrv_get_aio_context(bs), 2579 bdrv_ioctl_bh_cb, data); 2580 data->co = &co; 2581 qemu_bh_schedule(data->bh); 2582 } 2583 qemu_coroutine_yield(); 2584 out: 2585 tracked_request_end(&tracked_req); 2586 return co.ret; 2587 } 2588 2589 typedef struct { 2590 BlockDriverState *bs; 2591 int req; 2592 void *buf; 2593 int ret; 2594 } BdrvIoctlCoData; 2595 2596 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) 2597 { 2598 BdrvIoctlCoData *data = opaque; 2599 data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); 2600 } 2601 2602 /* needed for generic scsi interface */ 2603 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 2604 { 2605 BdrvIoctlCoData data = { 2606 .bs = bs, 2607 .req = req, 2608 .buf = buf, 2609 .ret = -EINPROGRESS, 2610 }; 2611 2612 if (qemu_in_coroutine()) { 2613 /* Fast-path if already in coroutine context */ 2614 bdrv_co_ioctl_entry(&data); 2615 } else { 2616 Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); 2617 qemu_coroutine_enter(co, &data); 2618 } 2619 while (data.ret == -EINPROGRESS) { 2620 aio_poll(bdrv_get_aio_context(bs), true); 2621 } 2622 return data.ret; 2623 } 2624 2625 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) 2626 { 2627 BlockAIOCBCoroutine *acb = opaque; 2628 acb->req.error = bdrv_co_do_ioctl(acb->common.bs, 2629 acb->req.req, acb->req.buf); 2630 bdrv_co_complete(acb); 2631 } 2632 2633 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, 2634 unsigned long int req, void *buf, 2635 BlockCompletionFunc *cb, void *opaque) 2636 { 2637 BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, 2638 bs, cb, opaque); 2639 Coroutine *co; 2640 2641 acb->need_bh = true; 2642 acb->req.error = -EINPROGRESS; 2643 acb->req.req = req; 2644 acb->req.buf = buf; 2645 co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); 2646 qemu_coroutine_enter(co, acb); 2647 2648 bdrv_co_maybe_schedule_bh(acb); 2649 return &acb->common; 2650 } 2651 2652 void *qemu_blockalign(BlockDriverState *bs, size_t size) 2653 { 2654 return qemu_memalign(bdrv_opt_mem_align(bs), size); 2655 } 2656 2657 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 2658 { 2659 return memset(qemu_blockalign(bs, size), 0, size); 2660 } 2661 2662 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 2663 { 2664 size_t align = bdrv_opt_mem_align(bs); 2665 2666 /* Ensure that NULL is never returned on success */ 2667 assert(align > 0); 
2668 if (size == 0) { 2669 size = align; 2670 } 2671 2672 return qemu_try_memalign(align, size); 2673 } 2674 2675 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 2676 { 2677 void *mem = qemu_try_blockalign(bs, size); 2678 2679 if (mem) { 2680 memset(mem, 0, size); 2681 } 2682 2683 return mem; 2684 } 2685 2686 /* 2687 * Check if all memory in this vector is sector aligned. 2688 */ 2689 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 2690 { 2691 int i; 2692 size_t alignment = bdrv_min_mem_align(bs); 2693 2694 for (i = 0; i < qiov->niov; i++) { 2695 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 2696 return false; 2697 } 2698 if (qiov->iov[i].iov_len % alignment) { 2699 return false; 2700 } 2701 } 2702 2703 return true; 2704 } 2705 2706 void bdrv_add_before_write_notifier(BlockDriverState *bs, 2707 NotifierWithReturn *notifier) 2708 { 2709 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 2710 } 2711 2712 void bdrv_io_plug(BlockDriverState *bs) 2713 { 2714 BlockDriver *drv = bs->drv; 2715 if (drv && drv->bdrv_io_plug) { 2716 drv->bdrv_io_plug(bs); 2717 } else if (bs->file) { 2718 bdrv_io_plug(bs->file->bs); 2719 } 2720 } 2721 2722 void bdrv_io_unplug(BlockDriverState *bs) 2723 { 2724 BlockDriver *drv = bs->drv; 2725 if (drv && drv->bdrv_io_unplug) { 2726 drv->bdrv_io_unplug(bs); 2727 } else if (bs->file) { 2728 bdrv_io_unplug(bs->file->bs); 2729 } 2730 } 2731 2732 void bdrv_flush_io_queue(BlockDriverState *bs) 2733 { 2734 BlockDriver *drv = bs->drv; 2735 if (drv && drv->bdrv_flush_io_queue) { 2736 drv->bdrv_flush_io_queue(bs); 2737 } else if (bs->file) { 2738 bdrv_flush_io_queue(bs->file->bs); 2739 } 2740 bdrv_start_throttled_reqs(bs); 2741 } 2742 2743 void bdrv_drained_begin(BlockDriverState *bs) 2744 { 2745 if (!bs->quiesce_counter++) { 2746 aio_disable_external(bdrv_get_aio_context(bs)); 2747 } 2748 bdrv_drain(bs); 2749 } 2750 2751 void bdrv_drained_end(BlockDriverState *bs) 2752 { 2753 assert(bs->quiesce_counter > 0); 2754 if (--bs->quiesce_counter > 0) { 2755 return; 2756 } 2757 aio_enable_external(bdrv_get_aio_context(bs)); 2758 } 2759
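/*
 * Illustrative sketch (editorial addition, kept compiled out): the intended
 * pairing of bdrv_drained_begin()/bdrv_drained_end() defined above.  Inside
 * the drained section no new external requests are dispatched to the node
 * and previously in-flight requests have been drained, so the caller can
 * safely reconfigure it.  The helper name example_quiesced_update() is
 * hypothetical.
 */
#if 0
static void example_quiesced_update(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);   /* disables external events, then drains bs */

    /* ... modify bs while no request is in flight ... */

    bdrv_drained_end(bs);     /* re-enables external event sources */
}
#endif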