/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_group_config(bs, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;
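    /* Throttling is now marked disabled for this bs; restart anything still
     * sitting in the throttled queues so it completes unthrottled before the
     * bs leaves its throttle group below. */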
    bdrv_start_throttled_reqs(bs);
    throttle_group_unregister_bs(bs);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is already part of the group we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
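 * In other words, every bdrv_enable_copy_on_read() call must eventually be
 * balanced by a bdrv_disable_copy_on_read() call.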
204 */ 205 void bdrv_enable_copy_on_read(BlockDriverState *bs) 206 { 207 bs->copy_on_read++; 208 } 209 210 void bdrv_disable_copy_on_read(BlockDriverState *bs) 211 { 212 assert(bs->copy_on_read > 0); 213 bs->copy_on_read--; 214 } 215 216 /* Check if any requests are in-flight (including throttled requests) */ 217 bool bdrv_requests_pending(BlockDriverState *bs) 218 { 219 BdrvChild *child; 220 221 if (!QLIST_EMPTY(&bs->tracked_requests)) { 222 return true; 223 } 224 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 225 return true; 226 } 227 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 228 return true; 229 } 230 231 QLIST_FOREACH(child, &bs->children, next) { 232 if (bdrv_requests_pending(child->bs)) { 233 return true; 234 } 235 } 236 237 return false; 238 } 239 240 static void bdrv_drain_recurse(BlockDriverState *bs) 241 { 242 BdrvChild *child; 243 244 if (bs->drv && bs->drv->bdrv_drain) { 245 bs->drv->bdrv_drain(bs); 246 } 247 QLIST_FOREACH(child, &bs->children, next) { 248 bdrv_drain_recurse(child->bs); 249 } 250 } 251 252 /* 253 * Wait for pending requests to complete on a single BlockDriverState subtree, 254 * and suspend block driver's internal I/O until next request arrives. 255 * 256 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 257 * AioContext. 258 * 259 * Only this BlockDriverState's AioContext is run, so in-flight requests must 260 * not depend on events in other AioContexts. In that case, use 261 * bdrv_drain_all() instead. 262 */ 263 void bdrv_drain(BlockDriverState *bs) 264 { 265 bool busy = true; 266 267 bdrv_drain_recurse(bs); 268 while (busy) { 269 /* Keep iterating */ 270 bdrv_flush_io_queue(bs); 271 busy = bdrv_requests_pending(bs); 272 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 273 } 274 } 275 276 /* 277 * Wait for pending requests to complete across all BlockDriverStates 278 * 279 * This function does not flush data to disk, use bdrv_flush_all() for that 280 * after calling this function. 281 */ 282 void bdrv_drain_all(void) 283 { 284 /* Always run first iteration so any pending completion BHs run */ 285 bool busy = true; 286 BlockDriverState *bs = NULL; 287 GSList *aio_ctxs = NULL, *ctx; 288 289 while ((bs = bdrv_next(bs))) { 290 AioContext *aio_context = bdrv_get_aio_context(bs); 291 292 aio_context_acquire(aio_context); 293 if (bs->job) { 294 block_job_pause(bs->job); 295 } 296 aio_context_release(aio_context); 297 298 if (!g_slist_find(aio_ctxs, aio_context)) { 299 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 300 } 301 } 302 303 /* Note that completion of an asynchronous I/O operation can trigger any 304 * number of other I/O operations on other devices---for example a 305 * coroutine can submit an I/O request to another device in response to 306 * request completion. Therefore we must keep looping until there was no 307 * more activity rather than simply draining each device independently. 
308 */ 309 while (busy) { 310 busy = false; 311 312 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 313 AioContext *aio_context = ctx->data; 314 bs = NULL; 315 316 aio_context_acquire(aio_context); 317 while ((bs = bdrv_next(bs))) { 318 if (aio_context == bdrv_get_aio_context(bs)) { 319 bdrv_flush_io_queue(bs); 320 if (bdrv_requests_pending(bs)) { 321 busy = true; 322 aio_poll(aio_context, busy); 323 } 324 } 325 } 326 busy |= aio_poll(aio_context, false); 327 aio_context_release(aio_context); 328 } 329 } 330 331 bs = NULL; 332 while ((bs = bdrv_next(bs))) { 333 AioContext *aio_context = bdrv_get_aio_context(bs); 334 335 aio_context_acquire(aio_context); 336 if (bs->job) { 337 block_job_resume(bs->job); 338 } 339 aio_context_release(aio_context); 340 } 341 g_slist_free(aio_ctxs); 342 } 343 344 /** 345 * Remove an active request from the tracked requests list 346 * 347 * This function should be called when a tracked request is completing. 348 */ 349 static void tracked_request_end(BdrvTrackedRequest *req) 350 { 351 if (req->serialising) { 352 req->bs->serialising_in_flight--; 353 } 354 355 QLIST_REMOVE(req, list); 356 qemu_co_queue_restart_all(&req->wait_queue); 357 } 358 359 /** 360 * Add an active request to the tracked requests list 361 */ 362 static void tracked_request_begin(BdrvTrackedRequest *req, 363 BlockDriverState *bs, 364 int64_t offset, 365 unsigned int bytes, 366 enum BdrvTrackedRequestType type) 367 { 368 *req = (BdrvTrackedRequest){ 369 .bs = bs, 370 .offset = offset, 371 .bytes = bytes, 372 .type = type, 373 .co = qemu_coroutine_self(), 374 .serialising = false, 375 .overlap_offset = offset, 376 .overlap_bytes = bytes, 377 }; 378 379 qemu_co_queue_init(&req->wait_queue); 380 381 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 382 } 383 384 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 385 { 386 int64_t overlap_offset = req->offset & ~(align - 1); 387 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 388 - overlap_offset; 389 390 if (!req->serialising) { 391 req->bs->serialising_in_flight++; 392 req->serialising = true; 393 } 394 395 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 396 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 397 } 398 399 /** 400 * Round a region to cluster boundaries 401 */ 402 void bdrv_round_to_clusters(BlockDriverState *bs, 403 int64_t sector_num, int nb_sectors, 404 int64_t *cluster_sector_num, 405 int *cluster_nb_sectors) 406 { 407 BlockDriverInfo bdi; 408 409 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 410 *cluster_sector_num = sector_num; 411 *cluster_nb_sectors = nb_sectors; 412 } else { 413 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 414 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 415 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 416 nb_sectors, c); 417 } 418 } 419 420 static int bdrv_get_cluster_size(BlockDriverState *bs) 421 { 422 BlockDriverInfo bdi; 423 int ret; 424 425 ret = bdrv_get_info(bs, &bdi); 426 if (ret < 0 || bdi.cluster_size == 0) { 427 return bs->request_alignment; 428 } else { 429 return bdi.cluster_size; 430 } 431 } 432 433 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 434 int64_t offset, unsigned int bytes) 435 { 436 /* aaaa bbbb */ 437 if (offset >= req->overlap_offset + req->overlap_bytes) { 438 return false; 439 } 440 /* bbbb aaaa */ 441 if (req->overlap_offset >= offset + bytes) { 442 return false; 443 } 444 return true; 445 } 446 447 static bool coroutine_fn 
wait_serialising_requests(BdrvTrackedRequest *self) 448 { 449 BlockDriverState *bs = self->bs; 450 BdrvTrackedRequest *req; 451 bool retry; 452 bool waited = false; 453 454 if (!bs->serialising_in_flight) { 455 return false; 456 } 457 458 do { 459 retry = false; 460 QLIST_FOREACH(req, &bs->tracked_requests, list) { 461 if (req == self || (!req->serialising && !self->serialising)) { 462 continue; 463 } 464 if (tracked_request_overlaps(req, self->overlap_offset, 465 self->overlap_bytes)) 466 { 467 /* Hitting this means there was a reentrant request, for 468 * example, a block driver issuing nested requests. This must 469 * never happen since it means deadlock. 470 */ 471 assert(qemu_coroutine_self() != req->co); 472 473 /* If the request is already (indirectly) waiting for us, or 474 * will wait for us as soon as it wakes up, then just go on 475 * (instead of producing a deadlock in the former case). */ 476 if (!req->waiting_for) { 477 self->waiting_for = req; 478 qemu_co_queue_wait(&req->wait_queue); 479 self->waiting_for = NULL; 480 retry = true; 481 waited = true; 482 break; 483 } 484 } 485 } 486 } while (retry); 487 488 return waited; 489 } 490 491 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 492 size_t size) 493 { 494 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 495 return -EIO; 496 } 497 498 if (!bdrv_is_inserted(bs)) { 499 return -ENOMEDIUM; 500 } 501 502 if (offset < 0) { 503 return -EIO; 504 } 505 506 return 0; 507 } 508 509 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 510 int nb_sectors) 511 { 512 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 513 return -EIO; 514 } 515 516 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 517 nb_sectors * BDRV_SECTOR_SIZE); 518 } 519 520 typedef struct RwCo { 521 BlockDriverState *bs; 522 int64_t offset; 523 QEMUIOVector *qiov; 524 bool is_write; 525 int ret; 526 BdrvRequestFlags flags; 527 } RwCo; 528 529 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 530 { 531 RwCo *rwco = opaque; 532 533 if (!rwco->is_write) { 534 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 535 rwco->qiov->size, rwco->qiov, 536 rwco->flags); 537 } else { 538 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 539 rwco->qiov->size, rwco->qiov, 540 rwco->flags); 541 } 542 } 543 544 /* 545 * Process a vectored synchronous request using coroutines 546 */ 547 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 548 QEMUIOVector *qiov, bool is_write, 549 BdrvRequestFlags flags) 550 { 551 Coroutine *co; 552 RwCo rwco = { 553 .bs = bs, 554 .offset = offset, 555 .qiov = qiov, 556 .is_write = is_write, 557 .ret = NOT_DONE, 558 .flags = flags, 559 }; 560 561 /** 562 * In sync call context, when the vcpu is blocked, this throttling timer 563 * will not fire; so the I/O throttling function has to be disabled here 564 * if it has been enabled. 
565 */ 566 if (bs->io_limits_enabled) { 567 fprintf(stderr, "Disabling I/O throttling on '%s' due " 568 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 569 bdrv_io_limits_disable(bs); 570 } 571 572 if (qemu_in_coroutine()) { 573 /* Fast-path if already in coroutine context */ 574 bdrv_rw_co_entry(&rwco); 575 } else { 576 AioContext *aio_context = bdrv_get_aio_context(bs); 577 578 co = qemu_coroutine_create(bdrv_rw_co_entry); 579 qemu_coroutine_enter(co, &rwco); 580 while (rwco.ret == NOT_DONE) { 581 aio_poll(aio_context, true); 582 } 583 } 584 return rwco.ret; 585 } 586 587 /* 588 * Process a synchronous request using coroutines 589 */ 590 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 591 int nb_sectors, bool is_write, BdrvRequestFlags flags) 592 { 593 QEMUIOVector qiov; 594 struct iovec iov = { 595 .iov_base = (void *)buf, 596 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 597 }; 598 599 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 600 return -EINVAL; 601 } 602 603 qemu_iovec_init_external(&qiov, &iov, 1); 604 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 605 &qiov, is_write, flags); 606 } 607 608 /* return < 0 if error. See bdrv_write() for the return codes */ 609 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 610 uint8_t *buf, int nb_sectors) 611 { 612 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 613 } 614 615 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 616 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 617 uint8_t *buf, int nb_sectors) 618 { 619 bool enabled; 620 int ret; 621 622 enabled = bs->io_limits_enabled; 623 bs->io_limits_enabled = false; 624 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 625 bs->io_limits_enabled = enabled; 626 return ret; 627 } 628 629 /* Return < 0 if error. Important errors are: 630 -EIO generic I/O error (may happen for all errors) 631 -ENOMEDIUM No media inserted. 632 -EINVAL Invalid sector number or nb_sectors 633 -EACCES Trying to write a read-only device 634 */ 635 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 636 const uint8_t *buf, int nb_sectors) 637 { 638 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 639 } 640 641 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 642 int nb_sectors, BdrvRequestFlags flags) 643 { 644 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 645 BDRV_REQ_ZERO_WRITE | flags); 646 } 647 648 /* 649 * Completely zero out a block device with the help of bdrv_write_zeroes. 650 * The operation is sped up by checking the block status and only writing 651 * zeroes to the device if they currently do not return zeroes. Optional 652 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 653 * 654 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
655 */ 656 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 657 { 658 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 659 int n; 660 661 target_sectors = bdrv_nb_sectors(bs); 662 if (target_sectors < 0) { 663 return target_sectors; 664 } 665 666 for (;;) { 667 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 668 if (nb_sectors <= 0) { 669 return 0; 670 } 671 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 672 if (ret < 0) { 673 error_report("error getting block status at sector %" PRId64 ": %s", 674 sector_num, strerror(-ret)); 675 return ret; 676 } 677 if (ret & BDRV_BLOCK_ZERO) { 678 sector_num += n; 679 continue; 680 } 681 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 682 if (ret < 0) { 683 error_report("error writing zeroes at sector %" PRId64 ": %s", 684 sector_num, strerror(-ret)); 685 return ret; 686 } 687 sector_num += n; 688 } 689 } 690 691 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 692 { 693 QEMUIOVector qiov; 694 struct iovec iov = { 695 .iov_base = (void *)buf, 696 .iov_len = bytes, 697 }; 698 int ret; 699 700 if (bytes < 0) { 701 return -EINVAL; 702 } 703 704 qemu_iovec_init_external(&qiov, &iov, 1); 705 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 706 if (ret < 0) { 707 return ret; 708 } 709 710 return bytes; 711 } 712 713 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 714 { 715 int ret; 716 717 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 718 if (ret < 0) { 719 return ret; 720 } 721 722 return qiov->size; 723 } 724 725 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 726 const void *buf, int bytes) 727 { 728 QEMUIOVector qiov; 729 struct iovec iov = { 730 .iov_base = (void *) buf, 731 .iov_len = bytes, 732 }; 733 734 if (bytes < 0) { 735 return -EINVAL; 736 } 737 738 qemu_iovec_init_external(&qiov, &iov, 1); 739 return bdrv_pwritev(bs, offset, &qiov); 740 } 741 742 /* 743 * Writes to the file and ensures that no writes are reordered across this 744 * request (acts as a barrier) 745 * 746 * Returns 0 on success, -errno in error cases. 747 */ 748 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 749 const void *buf, int count) 750 { 751 int ret; 752 753 ret = bdrv_pwrite(bs, offset, buf, count); 754 if (ret < 0) { 755 return ret; 756 } 757 758 /* No flush needed for cache modes that already do it */ 759 if (bs->enable_write_cache) { 760 bdrv_flush(bs); 761 } 762 763 return 0; 764 } 765 766 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 767 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 768 { 769 /* Perform I/O through a temporary buffer so that users who scribble over 770 * their read buffer while the operation is in progress do not end up 771 * modifying the image file. This is critical for zero-copy guest I/O 772 * where anything might happen inside guest memory. 773 */ 774 void *bounce_buffer; 775 776 BlockDriver *drv = bs->drv; 777 struct iovec iov; 778 QEMUIOVector bounce_qiov; 779 int64_t cluster_sector_num; 780 int cluster_nb_sectors; 781 size_t skip_bytes; 782 int ret; 783 784 /* Cover entire cluster so no additional backing file I/O is required when 785 * allocating cluster in the image file. 
786 */ 787 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 788 &cluster_sector_num, &cluster_nb_sectors); 789 790 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 791 cluster_sector_num, cluster_nb_sectors); 792 793 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 794 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 795 if (bounce_buffer == NULL) { 796 ret = -ENOMEM; 797 goto err; 798 } 799 800 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 801 802 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 803 &bounce_qiov); 804 if (ret < 0) { 805 goto err; 806 } 807 808 if (drv->bdrv_co_write_zeroes && 809 buffer_is_zero(bounce_buffer, iov.iov_len)) { 810 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 811 cluster_nb_sectors, 0); 812 } else { 813 /* This does not change the data on the disk, it is not necessary 814 * to flush even in cache=writethrough mode. 815 */ 816 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 817 &bounce_qiov); 818 } 819 820 if (ret < 0) { 821 /* It might be okay to ignore write errors for guest requests. If this 822 * is a deliberate copy-on-read then we don't want to ignore the error. 823 * Simply report it in all cases. 824 */ 825 goto err; 826 } 827 828 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 829 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 830 nb_sectors * BDRV_SECTOR_SIZE); 831 832 err: 833 qemu_vfree(bounce_buffer); 834 return ret; 835 } 836 837 /* 838 * Forwards an already correctly aligned request to the BlockDriver. This 839 * handles copy on read and zeroing after EOF; any other features must be 840 * implemented by the caller. 841 */ 842 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 843 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 844 int64_t align, QEMUIOVector *qiov, int flags) 845 { 846 BlockDriver *drv = bs->drv; 847 int ret; 848 849 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 850 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 851 852 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 853 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 854 assert(!qiov || bytes == qiov->size); 855 856 /* Handle Copy on Read and associated serialisation */ 857 if (flags & BDRV_REQ_COPY_ON_READ) { 858 /* If we touch the same cluster it counts as an overlap. This 859 * guarantees that allocating writes will be serialized and not race 860 * with each other for the same cluster. For example, in copy-on-read 861 * it ensures that the CoR read and write operations are atomic and 862 * guest writes cannot interleave between them. 
*/ 863 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 864 } 865 866 wait_serialising_requests(req); 867 868 if (flags & BDRV_REQ_COPY_ON_READ) { 869 int pnum; 870 871 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 872 if (ret < 0) { 873 goto out; 874 } 875 876 if (!ret || pnum != nb_sectors) { 877 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 878 goto out; 879 } 880 } 881 882 /* Forward the request to the BlockDriver */ 883 if (!bs->zero_beyond_eof) { 884 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 885 } else { 886 /* Read zeros after EOF */ 887 int64_t total_sectors, max_nb_sectors; 888 889 total_sectors = bdrv_nb_sectors(bs); 890 if (total_sectors < 0) { 891 ret = total_sectors; 892 goto out; 893 } 894 895 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 896 align >> BDRV_SECTOR_BITS); 897 if (nb_sectors < max_nb_sectors) { 898 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 899 } else if (max_nb_sectors > 0) { 900 QEMUIOVector local_qiov; 901 902 qemu_iovec_init(&local_qiov, qiov->niov); 903 qemu_iovec_concat(&local_qiov, qiov, 0, 904 max_nb_sectors * BDRV_SECTOR_SIZE); 905 906 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 907 &local_qiov); 908 909 qemu_iovec_destroy(&local_qiov); 910 } else { 911 ret = 0; 912 } 913 914 /* Reading beyond end of file is supposed to produce zeroes */ 915 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 916 uint64_t offset = MAX(0, total_sectors - sector_num); 917 uint64_t bytes = (sector_num + nb_sectors - offset) * 918 BDRV_SECTOR_SIZE; 919 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 920 } 921 } 922 923 out: 924 return ret; 925 } 926 927 /* 928 * Handle a read request in coroutine context 929 */ 930 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 931 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 932 BdrvRequestFlags flags) 933 { 934 BlockDriver *drv = bs->drv; 935 BdrvTrackedRequest req; 936 937 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 938 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 939 uint8_t *head_buf = NULL; 940 uint8_t *tail_buf = NULL; 941 QEMUIOVector local_qiov; 942 bool use_local_qiov = false; 943 int ret; 944 945 if (!drv) { 946 return -ENOMEDIUM; 947 } 948 949 ret = bdrv_check_byte_request(bs, offset, bytes); 950 if (ret < 0) { 951 return ret; 952 } 953 954 /* Don't do copy-on-read if we read data before write operation */ 955 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_COPY_ON_READ)) { 956 flags |= BDRV_REQ_COPY_ON_READ; 957 } 958 959 /* throttling disk I/O */ 960 if (bs->io_limits_enabled) { 961 throttle_group_co_io_limits_intercept(bs, bytes, false); 962 } 963 964 /* Align read if necessary by padding qiov */ 965 if (offset & (align - 1)) { 966 head_buf = qemu_blockalign(bs, align); 967 qemu_iovec_init(&local_qiov, qiov->niov + 2); 968 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 969 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 970 use_local_qiov = true; 971 972 bytes += offset & (align - 1); 973 offset = offset & ~(align - 1); 974 } 975 976 if ((offset + bytes) & (align - 1)) { 977 if (!use_local_qiov) { 978 qemu_iovec_init(&local_qiov, qiov->niov + 1); 979 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 980 use_local_qiov = true; 981 } 982 tail_buf = qemu_blockalign(bs, align); 983 qemu_iovec_add(&local_qiov, tail_buf, 984 align - ((offset + bytes) & (align - 1))); 985 986 bytes = ROUND_UP(bytes, align); 987 } 988 
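    /* Head/tail padding (if any) has been applied above, so the request is
     * now aligned; track it so overlapping requests can serialise against it. */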
989 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 990 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 991 use_local_qiov ? &local_qiov : qiov, 992 flags); 993 tracked_request_end(&req); 994 995 if (use_local_qiov) { 996 qemu_iovec_destroy(&local_qiov); 997 qemu_vfree(head_buf); 998 qemu_vfree(tail_buf); 999 } 1000 1001 return ret; 1002 } 1003 1004 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 1005 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1006 BdrvRequestFlags flags) 1007 { 1008 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1009 return -EINVAL; 1010 } 1011 1012 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 1013 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1014 } 1015 1016 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 1017 int nb_sectors, QEMUIOVector *qiov) 1018 { 1019 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1020 1021 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1022 } 1023 1024 int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs, 1025 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1026 { 1027 trace_bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors); 1028 1029 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1030 BDRV_REQ_NO_COPY_ON_READ); 1031 } 1032 1033 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1034 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1035 { 1036 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1037 1038 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1039 BDRV_REQ_COPY_ON_READ); 1040 } 1041 1042 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1043 1044 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1045 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1046 { 1047 BlockDriver *drv = bs->drv; 1048 QEMUIOVector qiov; 1049 struct iovec iov = {0}; 1050 int ret = 0; 1051 1052 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1053 BDRV_REQUEST_MAX_SECTORS); 1054 1055 while (nb_sectors > 0 && !ret) { 1056 int num = nb_sectors; 1057 1058 /* Align request. Block drivers can expect the "bulk" of the request 1059 * to be aligned. 1060 */ 1061 if (bs->bl.write_zeroes_alignment 1062 && num > bs->bl.write_zeroes_alignment) { 1063 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1064 /* Make a small request up to the first aligned sector. */ 1065 num = bs->bl.write_zeroes_alignment; 1066 num -= sector_num % bs->bl.write_zeroes_alignment; 1067 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1068 /* Shorten the request to the last aligned sector. num cannot 1069 * underflow because num > bs->bl.write_zeroes_alignment. 
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_offset < offset + bytes) {
        bs->wr_highest_offset = offset + bytes;
    }

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
                                                int64_t offset,
                                                unsigned int bytes,
1187 BdrvRequestFlags flags, 1188 BdrvTrackedRequest *req) 1189 { 1190 uint8_t *buf = NULL; 1191 QEMUIOVector local_qiov; 1192 struct iovec iov; 1193 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1194 unsigned int head_padding_bytes, tail_padding_bytes; 1195 int ret = 0; 1196 1197 head_padding_bytes = offset & (align - 1); 1198 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1199 1200 1201 assert(flags & BDRV_REQ_ZERO_WRITE); 1202 if (head_padding_bytes || tail_padding_bytes) { 1203 buf = qemu_blockalign(bs, align); 1204 iov = (struct iovec) { 1205 .iov_base = buf, 1206 .iov_len = align, 1207 }; 1208 qemu_iovec_init_external(&local_qiov, &iov, 1); 1209 } 1210 if (head_padding_bytes) { 1211 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1212 1213 /* RMW the unaligned part before head. */ 1214 mark_request_serialising(req, align); 1215 wait_serialising_requests(req); 1216 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1217 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1218 align, &local_qiov, 0); 1219 if (ret < 0) { 1220 goto fail; 1221 } 1222 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1223 1224 memset(buf + head_padding_bytes, 0, zero_bytes); 1225 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1226 &local_qiov, 1227 flags & ~BDRV_REQ_ZERO_WRITE); 1228 if (ret < 0) { 1229 goto fail; 1230 } 1231 offset += zero_bytes; 1232 bytes -= zero_bytes; 1233 } 1234 1235 assert(!bytes || (offset & (align - 1)) == 0); 1236 if (bytes >= align) { 1237 /* Write the aligned part in the middle. */ 1238 uint64_t aligned_bytes = bytes & ~(align - 1); 1239 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1240 NULL, flags); 1241 if (ret < 0) { 1242 goto fail; 1243 } 1244 bytes -= aligned_bytes; 1245 offset += aligned_bytes; 1246 } 1247 1248 assert(!bytes || (offset & (align - 1)) == 0); 1249 if (bytes) { 1250 assert(align == tail_padding_bytes + bytes); 1251 /* RMW the unaligned part after tail. */ 1252 mark_request_serialising(req, align); 1253 wait_serialising_requests(req); 1254 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1255 ret = bdrv_aligned_preadv(bs, req, offset, align, 1256 align, &local_qiov, 0); 1257 if (ret < 0) { 1258 goto fail; 1259 } 1260 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1261 1262 memset(buf, 0, bytes); 1263 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1264 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1265 } 1266 fail: 1267 qemu_vfree(buf); 1268 return ret; 1269 1270 } 1271 1272 /* 1273 * Handle a write request in coroutine context 1274 */ 1275 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1276 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1277 BdrvRequestFlags flags) 1278 { 1279 BdrvTrackedRequest req; 1280 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1281 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1282 uint8_t *head_buf = NULL; 1283 uint8_t *tail_buf = NULL; 1284 QEMUIOVector local_qiov; 1285 bool use_local_qiov = false; 1286 int ret; 1287 1288 if (!bs->drv) { 1289 return -ENOMEDIUM; 1290 } 1291 if (bs->read_only) { 1292 return -EPERM; 1293 } 1294 1295 ret = bdrv_check_byte_request(bs, offset, bytes); 1296 if (ret < 0) { 1297 return ret; 1298 } 1299 1300 /* throttling disk I/O */ 1301 if (bs->io_limits_enabled) { 1302 throttle_group_co_io_limits_intercept(bs, bytes, true); 1303 } 1304 1305 /* 1306 * Align write if necessary by performing a read-modify-write cycle. 
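 * (Zero writes reach this function with qiov == NULL and are routed through
 * bdrv_co_do_zero_pwritev(), which performs the same head/tail RMW itself.)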
1307 * Pad qiov with the read parts and be sure to have a tracked request not 1308 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1309 */ 1310 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1311 1312 if (!qiov) { 1313 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1314 goto out; 1315 } 1316 1317 if (offset & (align - 1)) { 1318 QEMUIOVector head_qiov; 1319 struct iovec head_iov; 1320 1321 mark_request_serialising(&req, align); 1322 wait_serialising_requests(&req); 1323 1324 head_buf = qemu_blockalign(bs, align); 1325 head_iov = (struct iovec) { 1326 .iov_base = head_buf, 1327 .iov_len = align, 1328 }; 1329 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1330 1331 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1332 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1333 align, &head_qiov, 0); 1334 if (ret < 0) { 1335 goto fail; 1336 } 1337 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1338 1339 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1340 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1341 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1342 use_local_qiov = true; 1343 1344 bytes += offset & (align - 1); 1345 offset = offset & ~(align - 1); 1346 } 1347 1348 if ((offset + bytes) & (align - 1)) { 1349 QEMUIOVector tail_qiov; 1350 struct iovec tail_iov; 1351 size_t tail_bytes; 1352 bool waited; 1353 1354 mark_request_serialising(&req, align); 1355 waited = wait_serialising_requests(&req); 1356 assert(!waited || !use_local_qiov); 1357 1358 tail_buf = qemu_blockalign(bs, align); 1359 tail_iov = (struct iovec) { 1360 .iov_base = tail_buf, 1361 .iov_len = align, 1362 }; 1363 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1364 1365 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1366 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1367 align, &tail_qiov, 0); 1368 if (ret < 0) { 1369 goto fail; 1370 } 1371 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1372 1373 if (!use_local_qiov) { 1374 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1375 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1376 use_local_qiov = true; 1377 } 1378 1379 tail_bytes = (offset + bytes) & (align - 1); 1380 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1381 1382 bytes = ROUND_UP(bytes, align); 1383 } 1384 1385 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1386 use_local_qiov ? 
&local_qiov : qiov, 1387 flags); 1388 1389 fail: 1390 1391 if (use_local_qiov) { 1392 qemu_iovec_destroy(&local_qiov); 1393 } 1394 qemu_vfree(head_buf); 1395 qemu_vfree(tail_buf); 1396 out: 1397 tracked_request_end(&req); 1398 return ret; 1399 } 1400 1401 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1402 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1403 BdrvRequestFlags flags) 1404 { 1405 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1406 return -EINVAL; 1407 } 1408 1409 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1410 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1411 } 1412 1413 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1414 int nb_sectors, QEMUIOVector *qiov) 1415 { 1416 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1417 1418 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1419 } 1420 1421 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1422 int64_t sector_num, int nb_sectors, 1423 BdrvRequestFlags flags) 1424 { 1425 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1426 1427 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1428 flags &= ~BDRV_REQ_MAY_UNMAP; 1429 } 1430 1431 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1432 BDRV_REQ_ZERO_WRITE | flags); 1433 } 1434 1435 int bdrv_flush_all(void) 1436 { 1437 BlockDriverState *bs = NULL; 1438 int result = 0; 1439 1440 while ((bs = bdrv_next(bs))) { 1441 AioContext *aio_context = bdrv_get_aio_context(bs); 1442 int ret; 1443 1444 aio_context_acquire(aio_context); 1445 ret = bdrv_flush(bs); 1446 if (ret < 0 && !result) { 1447 result = ret; 1448 } 1449 aio_context_release(aio_context); 1450 } 1451 1452 return result; 1453 } 1454 1455 typedef struct BdrvCoGetBlockStatusData { 1456 BlockDriverState *bs; 1457 BlockDriverState *base; 1458 int64_t sector_num; 1459 int nb_sectors; 1460 int *pnum; 1461 int64_t ret; 1462 bool done; 1463 } BdrvCoGetBlockStatusData; 1464 1465 /* 1466 * Returns the allocation status of the specified sectors. 1467 * Drivers not implementing the functionality are assumed to not support 1468 * backing files, hence all their sectors are reported as allocated. 1469 * 1470 * If 'sector_num' is beyond the end of the disk image the return value is 0 1471 * and 'pnum' is set to 0. 1472 * 1473 * 'pnum' is set to the number of sectors (including and immediately following 1474 * the specified sector) that are known to be in the same 1475 * allocated/unallocated state. 1476 * 1477 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1478 * beyond the end of the disk image it will be clamped. 
1479 */ 1480 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1481 int64_t sector_num, 1482 int nb_sectors, int *pnum) 1483 { 1484 int64_t total_sectors; 1485 int64_t n; 1486 int64_t ret, ret2; 1487 1488 total_sectors = bdrv_nb_sectors(bs); 1489 if (total_sectors < 0) { 1490 return total_sectors; 1491 } 1492 1493 if (sector_num >= total_sectors) { 1494 *pnum = 0; 1495 return 0; 1496 } 1497 1498 n = total_sectors - sector_num; 1499 if (n < nb_sectors) { 1500 nb_sectors = n; 1501 } 1502 1503 if (!bs->drv->bdrv_co_get_block_status) { 1504 *pnum = nb_sectors; 1505 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1506 if (bs->drv->protocol_name) { 1507 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1508 } 1509 return ret; 1510 } 1511 1512 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 1513 if (ret < 0) { 1514 *pnum = 0; 1515 return ret; 1516 } 1517 1518 if (ret & BDRV_BLOCK_RAW) { 1519 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1520 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1521 *pnum, pnum); 1522 } 1523 1524 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1525 ret |= BDRV_BLOCK_ALLOCATED; 1526 } else { 1527 if (bdrv_unallocated_blocks_are_zero(bs)) { 1528 ret |= BDRV_BLOCK_ZERO; 1529 } else if (bs->backing) { 1530 BlockDriverState *bs2 = bs->backing->bs; 1531 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1532 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1533 ret |= BDRV_BLOCK_ZERO; 1534 } 1535 } 1536 } 1537 1538 if (bs->file && 1539 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1540 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1541 int file_pnum; 1542 1543 ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1544 *pnum, &file_pnum); 1545 if (ret2 >= 0) { 1546 /* Ignore errors. This is just providing extra information, it 1547 * is useful but not necessary. 1548 */ 1549 if (!file_pnum) { 1550 /* !file_pnum indicates an offset at or beyond the EOF; it is 1551 * perfectly valid for the format block driver to point to such 1552 * offsets, so catch it and mark everything as zero */ 1553 ret |= BDRV_BLOCK_ZERO; 1554 } else { 1555 /* Limit request to the range reported by the protocol driver */ 1556 *pnum = file_pnum; 1557 ret |= (ret2 & BDRV_BLOCK_ZERO); 1558 } 1559 } 1560 } 1561 1562 return ret; 1563 } 1564 1565 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1566 BlockDriverState *base, 1567 int64_t sector_num, 1568 int nb_sectors, 1569 int *pnum) 1570 { 1571 BlockDriverState *p; 1572 int64_t ret = 0; 1573 1574 assert(bs != base); 1575 for (p = bs; p != base; p = backing_bs(p)) { 1576 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); 1577 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1578 break; 1579 } 1580 /* [sector_num, pnum] unallocated on this layer, which could be only 1581 * the first part of [sector_num, nb_sectors]. */ 1582 nb_sectors = MIN(nb_sectors, *pnum); 1583 } 1584 return ret; 1585 } 1586 1587 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1588 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1589 { 1590 BdrvCoGetBlockStatusData *data = opaque; 1591 1592 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1593 data->sector_num, 1594 data->nb_sectors, 1595 data->pnum); 1596 data->done = true; 1597 } 1598 1599 /* 1600 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1601 * 1602 * See bdrv_co_get_block_status_above() for details. 
1603 */ 1604 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1605 BlockDriverState *base, 1606 int64_t sector_num, 1607 int nb_sectors, int *pnum) 1608 { 1609 Coroutine *co; 1610 BdrvCoGetBlockStatusData data = { 1611 .bs = bs, 1612 .base = base, 1613 .sector_num = sector_num, 1614 .nb_sectors = nb_sectors, 1615 .pnum = pnum, 1616 .done = false, 1617 }; 1618 1619 if (qemu_in_coroutine()) { 1620 /* Fast-path if already in coroutine context */ 1621 bdrv_get_block_status_above_co_entry(&data); 1622 } else { 1623 AioContext *aio_context = bdrv_get_aio_context(bs); 1624 1625 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1626 qemu_coroutine_enter(co, &data); 1627 while (!data.done) { 1628 aio_poll(aio_context, true); 1629 } 1630 } 1631 return data.ret; 1632 } 1633 1634 int64_t bdrv_get_block_status(BlockDriverState *bs, 1635 int64_t sector_num, 1636 int nb_sectors, int *pnum) 1637 { 1638 return bdrv_get_block_status_above(bs, backing_bs(bs), 1639 sector_num, nb_sectors, pnum); 1640 } 1641 1642 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1643 int nb_sectors, int *pnum) 1644 { 1645 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 1646 if (ret < 0) { 1647 return ret; 1648 } 1649 return !!(ret & BDRV_BLOCK_ALLOCATED); 1650 } 1651 1652 /* 1653 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1654 * 1655 * Return true if the given sector is allocated in any image between 1656 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1657 * sector is allocated in any image of the chain. Return false otherwise. 1658 * 1659 * 'pnum' is set to the number of sectors (including and immediately following 1660 * the specified sector) that are known to be in the same 1661 * allocated/unallocated state. 1662 * 1663 */ 1664 int bdrv_is_allocated_above(BlockDriverState *top, 1665 BlockDriverState *base, 1666 int64_t sector_num, 1667 int nb_sectors, int *pnum) 1668 { 1669 BlockDriverState *intermediate; 1670 int ret, n = nb_sectors; 1671 1672 intermediate = top; 1673 while (intermediate && intermediate != base) { 1674 int pnum_inter; 1675 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1676 &pnum_inter); 1677 if (ret < 0) { 1678 return ret; 1679 } else if (ret) { 1680 *pnum = pnum_inter; 1681 return 1; 1682 } 1683 1684 /* 1685 * [sector_num, nb_sectors] is unallocated on top but intermediate 1686 * might have 1687 * 1688 * [sector_num+x, nr_sectors] allocated. 
1689 */ 1690 if (n > pnum_inter && 1691 (intermediate == top || 1692 sector_num + pnum_inter < intermediate->total_sectors)) { 1693 n = pnum_inter; 1694 } 1695 1696 intermediate = backing_bs(intermediate); 1697 } 1698 1699 *pnum = n; 1700 return 0; 1701 } 1702 1703 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1704 const uint8_t *buf, int nb_sectors) 1705 { 1706 BlockDriver *drv = bs->drv; 1707 int ret; 1708 1709 if (!drv) { 1710 return -ENOMEDIUM; 1711 } 1712 if (!drv->bdrv_write_compressed) { 1713 return -ENOTSUP; 1714 } 1715 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1716 if (ret < 0) { 1717 return ret; 1718 } 1719 1720 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1721 1722 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1723 } 1724 1725 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1726 int64_t pos, int size) 1727 { 1728 QEMUIOVector qiov; 1729 struct iovec iov = { 1730 .iov_base = (void *) buf, 1731 .iov_len = size, 1732 }; 1733 1734 qemu_iovec_init_external(&qiov, &iov, 1); 1735 return bdrv_writev_vmstate(bs, &qiov, pos); 1736 } 1737 1738 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1739 { 1740 BlockDriver *drv = bs->drv; 1741 1742 if (!drv) { 1743 return -ENOMEDIUM; 1744 } else if (drv->bdrv_save_vmstate) { 1745 return drv->bdrv_save_vmstate(bs, qiov, pos); 1746 } else if (bs->file) { 1747 return bdrv_writev_vmstate(bs->file->bs, qiov, pos); 1748 } 1749 1750 return -ENOTSUP; 1751 } 1752 1753 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1754 int64_t pos, int size) 1755 { 1756 BlockDriver *drv = bs->drv; 1757 if (!drv) 1758 return -ENOMEDIUM; 1759 if (drv->bdrv_load_vmstate) 1760 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1761 if (bs->file) 1762 return bdrv_load_vmstate(bs->file->bs, buf, pos, size); 1763 return -ENOTSUP; 1764 } 1765 1766 /**************************************************************/ 1767 /* async I/Os */ 1768 1769 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1770 QEMUIOVector *qiov, int nb_sectors, 1771 BlockCompletionFunc *cb, void *opaque) 1772 { 1773 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1774 1775 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1776 cb, opaque, false); 1777 } 1778 1779 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1780 QEMUIOVector *qiov, int nb_sectors, 1781 BlockCompletionFunc *cb, void *opaque) 1782 { 1783 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1784 1785 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1786 cb, opaque, true); 1787 } 1788 1789 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1790 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1791 BlockCompletionFunc *cb, void *opaque) 1792 { 1793 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1794 1795 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1796 BDRV_REQ_ZERO_WRITE | flags, 1797 cb, opaque, true); 1798 } 1799 1800 1801 typedef struct MultiwriteCB { 1802 int error; 1803 int num_requests; 1804 int num_callbacks; 1805 struct { 1806 BlockCompletionFunc *cb; 1807 void *opaque; 1808 QEMUIOVector *free_qiov; 1809 } callbacks[]; 1810 } MultiwriteCB; 1811 1812 static void multiwrite_user_cb(MultiwriteCB *mcb) 1813 { 1814 int i; 1815 1816 for (i = 0; i < mcb->num_callbacks; i++) { 1817 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1818 if (mcb->callbacks[i].free_qiov) { 1819 
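            /* free_qiov was allocated by multiwrite_merge() for merged
             * requests; destroy its iovec array here and free the struct
             * just below. */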
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    if (bs->blk) {
        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
                              num_reqs - outidx - 1);
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted.
 * In case of error, this function returns -1, and any of the
 * requests may or may not have been submitted yet. In particular, this means
 * that the callback will be called for some of the requests, for others it
 * won't. The caller must check the error field of the BlockRequest to wait
 * for the right callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async; otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
 */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}
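
/*
 * Illustrative sketch of why completion is deferred via need_bh above (not
 * used by this file): a caller typically stores the ACB returned by an
 * emulated bdrv_aio_* function and expects its callback to run only after
 * that function has returned.  If the coroutine finishes synchronously,
 * bdrv_co_maybe_schedule_bh() delivers the completion from a bottom half so
 * this ordering still holds.  ExampleFlushState is a hypothetical caller.
 */
typedef struct ExampleFlushState {
    BlockAIOCB *acb;    /* guaranteed to be set before the callback runs */
} ExampleFlushState;

static void example_flush_done(void *opaque, int ret)
{
    ExampleFlushState *s = opaque;

    s->acb = NULL;      /* safe even if the flush completed immediately */
}

static void example_start_flush(BlockDriverState *bs, ExampleFlushState *s)
{
    s->acb = bdrv_aio_flush(bs, example_flush_done, s);
}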
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_malloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_free(acb);
    }
}

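/*
 * Illustrative sketch of the ACB refcount contract above (not used by this
 * file): qemu_aio_get() returns an ACB with refcnt == 1, owned by whoever
 * emits the request; that reference is dropped right after the completion
 * callback has been delivered.  Code that needs the ACB to stay valid across
 * completion, as bdrv_aio_cancel() does, takes an extra reference first.
 * ExampleACB and the helpers below are hypothetical.
 */
typedef struct ExampleACB {
    BlockAIOCB common;
    int ret;
} ExampleACB;

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(ExampleACB),
};

static ExampleACB *example_new_request(BlockDriverState *bs,
                                       BlockCompletionFunc *cb, void *opaque)
{
    /* the returned ACB starts out with a single reference */
    return qemu_aio_get(&example_aiocb_info, bs, cb, opaque);
}

static void example_complete_request(ExampleACB *acb)
{
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_unref(acb);    /* drop the emitter's reference */
}
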
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
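
/*
 * Illustrative sketch of the yield/re-enter pattern used by bdrv_co_io_em()
 * above (not used by this file): a coroutine issues a driver AIO request with
 * bdrv_co_io_em_complete() as the completion callback, yields, and is resumed
 * by that callback once the request finishes.  The bdrv_aio_flush driver hook
 * is just one possible callee.
 */
static int coroutine_fn example_co_flush_via_aio(BlockDriverState *bs)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
    if (!acb) {
        return -EIO;            /* the request was never started */
    }
    qemu_coroutine_yield();     /* resumed by bdrv_co_io_em_complete() */

    return co.ret;
}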

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;
    BdrvTrackedRequest req;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    tracked_request_end(&req);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    BdrvTrackedRequest req;
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors,
                          BDRV_TRACKED_DISCARD);
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    ret = 0;
out:
    tracked_request_end(&req);
    return ret;
}
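
/*
 * Illustrative sketch of the alignment handling in bdrv_co_discard() above
 * (not used by this file): a hypothetical helper computing the length of the
 * first chunk so that the following chunks start on a discard_alignment
 * boundary.  With alignment == 8, sector_num == 10 and nb_sectors == 30 it
 * returns 6, leaving the remaining 24 sectors aligned at sector 16.
 */
static int64_t example_first_discard_chunk(int64_t sector_num,
                                           int64_t nb_sectors,
                                           int64_t alignment)
{
    int64_t num = nb_sectors;

    if (alignment && num >= alignment && sector_num % alignment) {
        if (num > alignment) {
            num = alignment;
        }
        num -= sector_num % alignment;
    }
    return num;
}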

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct {
    CoroutineIOCompletion *co;
    QEMUBH *bh;
} BdrvIoctlCompletionData;

static void bdrv_ioctl_bh_cb(void *opaque)
{
    BdrvIoctlCompletionData *data = opaque;

    bdrv_co_io_em_complete(data->co, -ENOTSUP);
    qemu_bh_delete(data->bh);
}

static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest tracked_req;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
    if (!drv || !drv->bdrv_aio_ioctl) {
        co.ret = -ENOTSUP;
        goto out;
    }

    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
    if (!acb) {
        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
                              bdrv_ioctl_bh_cb, data);
        data->co = &co;
        qemu_bh_schedule(data->bh);
    }
    qemu_coroutine_yield();
out:
    tracked_request_end(&tracked_req);
    return co.ret;
}

typedef struct {
    BlockDriverState *bs;
    int req;
    void *buf;
    int ret;
} BdrvIoctlCoData;

static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
{
    BdrvIoctlCoData *data = opaque;
    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
}

/* needed for generic scsi interface */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BdrvIoctlCoData data = {
        .bs = bs,
        .req = req,
        .buf = buf,
        .ret = -EINPROGRESS,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_co_ioctl_entry(&data);
    } else {
        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
        qemu_coroutine_enter(co, &data);
    }
    while (data.ret == -EINPROGRESS) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }
    return data.ret;
}

static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
                                      acb->req.req, acb->req.buf);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
                                            bs, cb, opaque);
    Coroutine *co;

    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.req = req;
    acb->req.buf = buf;
    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
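
/*
 * Illustrative sketch (not used by this file) of the synchronous-wrapper
 * pattern shared by bdrv_flush(), bdrv_discard() and bdrv_ioctl() above: run
 * the coroutine entry point directly when already in coroutine context,
 * otherwise spawn a coroutine and poll the AioContext until it reports
 * completion.  ExampleCo and example_co_entry are hypothetical.
 */
typedef struct ExampleCo {
    BlockDriverState *bs;
    int ret;
} ExampleCo;

static void coroutine_fn example_co_entry(void *opaque)
{
    ExampleCo *ec = opaque;

    ec->ret = bdrv_co_flush(ec->bs);    /* any coroutine_fn would do */
}

static int example_sync_wrapper(BlockDriverState *bs)
{
    ExampleCo ec = { .bs = bs, .ret = NOT_DONE };

    if (qemu_in_coroutine()) {
        /* already in coroutine context: just call the entry point */
        example_co_entry(&ec);
    } else {
        Coroutine *co = qemu_coroutine_create(example_co_entry);
        qemu_coroutine_enter(co, &ec);
        while (ec.ret == NOT_DONE) {
            aio_poll(bdrv_get_aio_context(bs), true);
        }
    }
    return ec.ret;
}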

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file->bs);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file->bs);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file->bs);
    }
    bdrv_start_throttled_reqs(bs);
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    if (!bs->quiesce_counter++) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }
    bdrv_drain(bs);
}

void bdrv_drained_end(BlockDriverState *bs)
{
    assert(bs->quiesce_counter > 0);
    if (--bs->quiesce_counter > 0) {
        return;
    }
    aio_enable_external(bdrv_get_aio_context(bs));
}
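
/*
 * Illustrative usage sketch of the drained section above (not used by this
 * file): quiesce new external I/O and wait for in-flight requests before
 * touching state that those requests might observe.  The body of the drained
 * section here is a hypothetical placeholder.
 */
static void example_with_drained_section(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);     /* stop external requests and drain */
    /* ... safely inspect or modify bs state here ... */
    bdrv_drained_end(bs);       /* resume external request processing */
}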