/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_group_config(bs, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;
    bdrv_start_throttled_reqs(bs);
    throttle_group_unregister_bs(bs);
}
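
/*
 * Illustrative sketch (not part of the original file): a caller that wants
 * throttling typically registers the BDS with a throttle group first and then
 * applies a ThrottleConfig, roughly like this:
 *
 *     ThrottleConfig cfg;
 *
 *     bdrv_io_limits_enable(bs, "group0");
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 1000;   (about 1000 IOPS)
 *     bdrv_set_io_limits(bs, &cfg);
 *
 * The bucket and field names are an assumption based on
 * include/qemu/throttle.h and are shown only for illustration.
 */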

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is already part of the group we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
        bs->bl.max_iov = bs->file->bs->bl.max_iov;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
        bs->bl.max_iov =
            MIN(bs->bl.max_iov,
                bs->backing->bs->bl.max_iov);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
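 *
 * A minimal usage sketch (illustrative, not from the original sources): a
 * user that wants copy-on-read for the duration of some work simply brackets
 * it with the two calls below:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... issue read requests ...
 *     bdrv_disable_copy_on_read(bs);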
207 */ 208 void bdrv_enable_copy_on_read(BlockDriverState *bs) 209 { 210 bs->copy_on_read++; 211 } 212 213 void bdrv_disable_copy_on_read(BlockDriverState *bs) 214 { 215 assert(bs->copy_on_read > 0); 216 bs->copy_on_read--; 217 } 218 219 /* Check if any requests are in-flight (including throttled requests) */ 220 bool bdrv_requests_pending(BlockDriverState *bs) 221 { 222 BdrvChild *child; 223 224 if (!QLIST_EMPTY(&bs->tracked_requests)) { 225 return true; 226 } 227 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 228 return true; 229 } 230 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 231 return true; 232 } 233 234 QLIST_FOREACH(child, &bs->children, next) { 235 if (bdrv_requests_pending(child->bs)) { 236 return true; 237 } 238 } 239 240 return false; 241 } 242 243 static void bdrv_drain_recurse(BlockDriverState *bs) 244 { 245 BdrvChild *child; 246 247 if (bs->drv && bs->drv->bdrv_drain) { 248 bs->drv->bdrv_drain(bs); 249 } 250 QLIST_FOREACH(child, &bs->children, next) { 251 bdrv_drain_recurse(child->bs); 252 } 253 } 254 255 /* 256 * Wait for pending requests to complete on a single BlockDriverState subtree, 257 * and suspend block driver's internal I/O until next request arrives. 258 * 259 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 260 * AioContext. 261 * 262 * Only this BlockDriverState's AioContext is run, so in-flight requests must 263 * not depend on events in other AioContexts. In that case, use 264 * bdrv_drain_all() instead. 265 */ 266 void bdrv_drain(BlockDriverState *bs) 267 { 268 bool busy = true; 269 270 bdrv_drain_recurse(bs); 271 while (busy) { 272 /* Keep iterating */ 273 bdrv_flush_io_queue(bs); 274 busy = bdrv_requests_pending(bs); 275 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 276 } 277 } 278 279 /* 280 * Wait for pending requests to complete across all BlockDriverStates 281 * 282 * This function does not flush data to disk, use bdrv_flush_all() for that 283 * after calling this function. 284 */ 285 void bdrv_drain_all(void) 286 { 287 /* Always run first iteration so any pending completion BHs run */ 288 bool busy = true; 289 BlockDriverState *bs = NULL; 290 GSList *aio_ctxs = NULL, *ctx; 291 292 while ((bs = bdrv_next(bs))) { 293 AioContext *aio_context = bdrv_get_aio_context(bs); 294 295 aio_context_acquire(aio_context); 296 if (bs->job) { 297 block_job_pause(bs->job); 298 } 299 bdrv_drain_recurse(bs); 300 aio_context_release(aio_context); 301 302 if (!g_slist_find(aio_ctxs, aio_context)) { 303 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 304 } 305 } 306 307 /* Note that completion of an asynchronous I/O operation can trigger any 308 * number of other I/O operations on other devices---for example a 309 * coroutine can submit an I/O request to another device in response to 310 * request completion. Therefore we must keep looping until there was no 311 * more activity rather than simply draining each device independently. 
312 */ 313 while (busy) { 314 busy = false; 315 316 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 317 AioContext *aio_context = ctx->data; 318 bs = NULL; 319 320 aio_context_acquire(aio_context); 321 while ((bs = bdrv_next(bs))) { 322 if (aio_context == bdrv_get_aio_context(bs)) { 323 bdrv_flush_io_queue(bs); 324 if (bdrv_requests_pending(bs)) { 325 busy = true; 326 aio_poll(aio_context, busy); 327 } 328 } 329 } 330 busy |= aio_poll(aio_context, false); 331 aio_context_release(aio_context); 332 } 333 } 334 335 bs = NULL; 336 while ((bs = bdrv_next(bs))) { 337 AioContext *aio_context = bdrv_get_aio_context(bs); 338 339 aio_context_acquire(aio_context); 340 if (bs->job) { 341 block_job_resume(bs->job); 342 } 343 aio_context_release(aio_context); 344 } 345 g_slist_free(aio_ctxs); 346 } 347 348 /** 349 * Remove an active request from the tracked requests list 350 * 351 * This function should be called when a tracked request is completing. 352 */ 353 static void tracked_request_end(BdrvTrackedRequest *req) 354 { 355 if (req->serialising) { 356 req->bs->serialising_in_flight--; 357 } 358 359 QLIST_REMOVE(req, list); 360 qemu_co_queue_restart_all(&req->wait_queue); 361 } 362 363 /** 364 * Add an active request to the tracked requests list 365 */ 366 static void tracked_request_begin(BdrvTrackedRequest *req, 367 BlockDriverState *bs, 368 int64_t offset, 369 unsigned int bytes, 370 enum BdrvTrackedRequestType type) 371 { 372 *req = (BdrvTrackedRequest){ 373 .bs = bs, 374 .offset = offset, 375 .bytes = bytes, 376 .type = type, 377 .co = qemu_coroutine_self(), 378 .serialising = false, 379 .overlap_offset = offset, 380 .overlap_bytes = bytes, 381 }; 382 383 qemu_co_queue_init(&req->wait_queue); 384 385 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 386 } 387 388 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 389 { 390 int64_t overlap_offset = req->offset & ~(align - 1); 391 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 392 - overlap_offset; 393 394 if (!req->serialising) { 395 req->bs->serialising_in_flight++; 396 req->serialising = true; 397 } 398 399 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 400 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 401 } 402 403 /** 404 * Round a region to cluster boundaries 405 */ 406 void bdrv_round_to_clusters(BlockDriverState *bs, 407 int64_t sector_num, int nb_sectors, 408 int64_t *cluster_sector_num, 409 int *cluster_nb_sectors) 410 { 411 BlockDriverInfo bdi; 412 413 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 414 *cluster_sector_num = sector_num; 415 *cluster_nb_sectors = nb_sectors; 416 } else { 417 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 418 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 419 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 420 nb_sectors, c); 421 } 422 } 423 424 static int bdrv_get_cluster_size(BlockDriverState *bs) 425 { 426 BlockDriverInfo bdi; 427 int ret; 428 429 ret = bdrv_get_info(bs, &bdi); 430 if (ret < 0 || bdi.cluster_size == 0) { 431 return bs->request_alignment; 432 } else { 433 return bdi.cluster_size; 434 } 435 } 436 437 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 438 int64_t offset, unsigned int bytes) 439 { 440 /* aaaa bbbb */ 441 if (offset >= req->overlap_offset + req->overlap_bytes) { 442 return false; 443 } 444 /* bbbb aaaa */ 445 if (req->overlap_offset >= offset + bytes) { 446 return false; 447 } 448 return true; 449 } 450 451 static bool coroutine_fn 
wait_serialising_requests(BdrvTrackedRequest *self) 452 { 453 BlockDriverState *bs = self->bs; 454 BdrvTrackedRequest *req; 455 bool retry; 456 bool waited = false; 457 458 if (!bs->serialising_in_flight) { 459 return false; 460 } 461 462 do { 463 retry = false; 464 QLIST_FOREACH(req, &bs->tracked_requests, list) { 465 if (req == self || (!req->serialising && !self->serialising)) { 466 continue; 467 } 468 if (tracked_request_overlaps(req, self->overlap_offset, 469 self->overlap_bytes)) 470 { 471 /* Hitting this means there was a reentrant request, for 472 * example, a block driver issuing nested requests. This must 473 * never happen since it means deadlock. 474 */ 475 assert(qemu_coroutine_self() != req->co); 476 477 /* If the request is already (indirectly) waiting for us, or 478 * will wait for us as soon as it wakes up, then just go on 479 * (instead of producing a deadlock in the former case). */ 480 if (!req->waiting_for) { 481 self->waiting_for = req; 482 qemu_co_queue_wait(&req->wait_queue); 483 self->waiting_for = NULL; 484 retry = true; 485 waited = true; 486 break; 487 } 488 } 489 } 490 } while (retry); 491 492 return waited; 493 } 494 495 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 496 size_t size) 497 { 498 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 499 return -EIO; 500 } 501 502 if (!bdrv_is_inserted(bs)) { 503 return -ENOMEDIUM; 504 } 505 506 if (offset < 0) { 507 return -EIO; 508 } 509 510 return 0; 511 } 512 513 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 514 int nb_sectors) 515 { 516 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 517 return -EIO; 518 } 519 520 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 521 nb_sectors * BDRV_SECTOR_SIZE); 522 } 523 524 typedef struct RwCo { 525 BlockDriverState *bs; 526 int64_t offset; 527 QEMUIOVector *qiov; 528 bool is_write; 529 int ret; 530 BdrvRequestFlags flags; 531 } RwCo; 532 533 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 534 { 535 RwCo *rwco = opaque; 536 537 if (!rwco->is_write) { 538 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 539 rwco->qiov->size, rwco->qiov, 540 rwco->flags); 541 } else { 542 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 543 rwco->qiov->size, rwco->qiov, 544 rwco->flags); 545 } 546 } 547 548 /* 549 * Process a vectored synchronous request using coroutines 550 */ 551 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 552 QEMUIOVector *qiov, bool is_write, 553 BdrvRequestFlags flags) 554 { 555 Coroutine *co; 556 RwCo rwco = { 557 .bs = bs, 558 .offset = offset, 559 .qiov = qiov, 560 .is_write = is_write, 561 .ret = NOT_DONE, 562 .flags = flags, 563 }; 564 565 /** 566 * In sync call context, when the vcpu is blocked, this throttling timer 567 * will not fire; so the I/O throttling function has to be disabled here 568 * if it has been enabled. 
569 */ 570 if (bs->io_limits_enabled) { 571 fprintf(stderr, "Disabling I/O throttling on '%s' due " 572 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 573 bdrv_io_limits_disable(bs); 574 } 575 576 if (qemu_in_coroutine()) { 577 /* Fast-path if already in coroutine context */ 578 bdrv_rw_co_entry(&rwco); 579 } else { 580 AioContext *aio_context = bdrv_get_aio_context(bs); 581 582 co = qemu_coroutine_create(bdrv_rw_co_entry); 583 qemu_coroutine_enter(co, &rwco); 584 while (rwco.ret == NOT_DONE) { 585 aio_poll(aio_context, true); 586 } 587 } 588 return rwco.ret; 589 } 590 591 /* 592 * Process a synchronous request using coroutines 593 */ 594 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 595 int nb_sectors, bool is_write, BdrvRequestFlags flags) 596 { 597 QEMUIOVector qiov; 598 struct iovec iov = { 599 .iov_base = (void *)buf, 600 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 601 }; 602 603 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 604 return -EINVAL; 605 } 606 607 qemu_iovec_init_external(&qiov, &iov, 1); 608 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 609 &qiov, is_write, flags); 610 } 611 612 /* return < 0 if error. See bdrv_write() for the return codes */ 613 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 614 uint8_t *buf, int nb_sectors) 615 { 616 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 617 } 618 619 /* Return < 0 if error. Important errors are: 620 -EIO generic I/O error (may happen for all errors) 621 -ENOMEDIUM No media inserted. 622 -EINVAL Invalid sector number or nb_sectors 623 -EACCES Trying to write a read-only device 624 */ 625 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 626 const uint8_t *buf, int nb_sectors) 627 { 628 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 629 } 630 631 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 632 int nb_sectors, BdrvRequestFlags flags) 633 { 634 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 635 BDRV_REQ_ZERO_WRITE | flags); 636 } 637 638 /* 639 * Completely zero out a block device with the help of bdrv_write_zeroes. 640 * The operation is sped up by checking the block status and only writing 641 * zeroes to the device if they currently do not return zeroes. Optional 642 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 643 * 644 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
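 *
 * For example (illustrative), a caller that also wants the zeroed range to be
 * discarded where possible might invoke:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);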
645 */ 646 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 647 { 648 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 649 BlockDriverState *file; 650 int n; 651 652 target_sectors = bdrv_nb_sectors(bs); 653 if (target_sectors < 0) { 654 return target_sectors; 655 } 656 657 for (;;) { 658 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 659 if (nb_sectors <= 0) { 660 return 0; 661 } 662 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); 663 if (ret < 0) { 664 error_report("error getting block status at sector %" PRId64 ": %s", 665 sector_num, strerror(-ret)); 666 return ret; 667 } 668 if (ret & BDRV_BLOCK_ZERO) { 669 sector_num += n; 670 continue; 671 } 672 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 673 if (ret < 0) { 674 error_report("error writing zeroes at sector %" PRId64 ": %s", 675 sector_num, strerror(-ret)); 676 return ret; 677 } 678 sector_num += n; 679 } 680 } 681 682 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 683 { 684 QEMUIOVector qiov; 685 struct iovec iov = { 686 .iov_base = (void *)buf, 687 .iov_len = bytes, 688 }; 689 int ret; 690 691 if (bytes < 0) { 692 return -EINVAL; 693 } 694 695 qemu_iovec_init_external(&qiov, &iov, 1); 696 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 697 if (ret < 0) { 698 return ret; 699 } 700 701 return bytes; 702 } 703 704 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 705 { 706 int ret; 707 708 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 709 if (ret < 0) { 710 return ret; 711 } 712 713 return qiov->size; 714 } 715 716 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 717 const void *buf, int bytes) 718 { 719 QEMUIOVector qiov; 720 struct iovec iov = { 721 .iov_base = (void *) buf, 722 .iov_len = bytes, 723 }; 724 725 if (bytes < 0) { 726 return -EINVAL; 727 } 728 729 qemu_iovec_init_external(&qiov, &iov, 1); 730 return bdrv_pwritev(bs, offset, &qiov); 731 } 732 733 /* 734 * Writes to the file and ensures that no writes are reordered across this 735 * request (acts as a barrier) 736 * 737 * Returns 0 on success, -errno in error cases. 738 */ 739 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 740 const void *buf, int count) 741 { 742 int ret; 743 744 ret = bdrv_pwrite(bs, offset, buf, count); 745 if (ret < 0) { 746 return ret; 747 } 748 749 /* No flush needed for cache modes that already do it */ 750 if (bs->enable_write_cache) { 751 bdrv_flush(bs); 752 } 753 754 return 0; 755 } 756 757 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 758 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 759 { 760 /* Perform I/O through a temporary buffer so that users who scribble over 761 * their read buffer while the operation is in progress do not end up 762 * modifying the image file. This is critical for zero-copy guest I/O 763 * where anything might happen inside guest memory. 764 */ 765 void *bounce_buffer; 766 767 BlockDriver *drv = bs->drv; 768 struct iovec iov; 769 QEMUIOVector bounce_qiov; 770 int64_t cluster_sector_num; 771 int cluster_nb_sectors; 772 size_t skip_bytes; 773 int ret; 774 775 /* Cover entire cluster so no additional backing file I/O is required when 776 * allocating cluster in the image file. 
777 */ 778 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 779 &cluster_sector_num, &cluster_nb_sectors); 780 781 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 782 cluster_sector_num, cluster_nb_sectors); 783 784 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 785 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 786 if (bounce_buffer == NULL) { 787 ret = -ENOMEM; 788 goto err; 789 } 790 791 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 792 793 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 794 &bounce_qiov); 795 if (ret < 0) { 796 goto err; 797 } 798 799 if (drv->bdrv_co_write_zeroes && 800 buffer_is_zero(bounce_buffer, iov.iov_len)) { 801 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 802 cluster_nb_sectors, 0); 803 } else { 804 /* This does not change the data on the disk, it is not necessary 805 * to flush even in cache=writethrough mode. 806 */ 807 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 808 &bounce_qiov); 809 } 810 811 if (ret < 0) { 812 /* It might be okay to ignore write errors for guest requests. If this 813 * is a deliberate copy-on-read then we don't want to ignore the error. 814 * Simply report it in all cases. 815 */ 816 goto err; 817 } 818 819 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 820 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 821 nb_sectors * BDRV_SECTOR_SIZE); 822 823 err: 824 qemu_vfree(bounce_buffer); 825 return ret; 826 } 827 828 /* 829 * Forwards an already correctly aligned request to the BlockDriver. This 830 * handles copy on read and zeroing after EOF; any other features must be 831 * implemented by the caller. 832 */ 833 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 834 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 835 int64_t align, QEMUIOVector *qiov, int flags) 836 { 837 BlockDriver *drv = bs->drv; 838 int ret; 839 840 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 841 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 842 843 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 844 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 845 assert(!qiov || bytes == qiov->size); 846 847 /* Handle Copy on Read and associated serialisation */ 848 if (flags & BDRV_REQ_COPY_ON_READ) { 849 /* If we touch the same cluster it counts as an overlap. This 850 * guarantees that allocating writes will be serialized and not race 851 * with each other for the same cluster. For example, in copy-on-read 852 * it ensures that the CoR read and write operations are atomic and 853 * guest writes cannot interleave between them. 
*/ 854 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 855 } 856 857 if (!(flags & BDRV_REQ_NO_SERIALISING)) { 858 wait_serialising_requests(req); 859 } 860 861 if (flags & BDRV_REQ_COPY_ON_READ) { 862 int pnum; 863 864 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 865 if (ret < 0) { 866 goto out; 867 } 868 869 if (!ret || pnum != nb_sectors) { 870 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 871 goto out; 872 } 873 } 874 875 /* Forward the request to the BlockDriver */ 876 if (!bs->zero_beyond_eof) { 877 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 878 } else { 879 /* Read zeros after EOF */ 880 int64_t total_sectors, max_nb_sectors; 881 882 total_sectors = bdrv_nb_sectors(bs); 883 if (total_sectors < 0) { 884 ret = total_sectors; 885 goto out; 886 } 887 888 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 889 align >> BDRV_SECTOR_BITS); 890 if (nb_sectors < max_nb_sectors) { 891 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 892 } else if (max_nb_sectors > 0) { 893 QEMUIOVector local_qiov; 894 895 qemu_iovec_init(&local_qiov, qiov->niov); 896 qemu_iovec_concat(&local_qiov, qiov, 0, 897 max_nb_sectors * BDRV_SECTOR_SIZE); 898 899 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 900 &local_qiov); 901 902 qemu_iovec_destroy(&local_qiov); 903 } else { 904 ret = 0; 905 } 906 907 /* Reading beyond end of file is supposed to produce zeroes */ 908 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 909 uint64_t offset = MAX(0, total_sectors - sector_num); 910 uint64_t bytes = (sector_num + nb_sectors - offset) * 911 BDRV_SECTOR_SIZE; 912 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 913 } 914 } 915 916 out: 917 return ret; 918 } 919 920 /* 921 * Handle a read request in coroutine context 922 */ 923 int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 924 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 925 BdrvRequestFlags flags) 926 { 927 BlockDriver *drv = bs->drv; 928 BdrvTrackedRequest req; 929 930 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 931 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 932 uint8_t *head_buf = NULL; 933 uint8_t *tail_buf = NULL; 934 QEMUIOVector local_qiov; 935 bool use_local_qiov = false; 936 int ret; 937 938 if (!drv) { 939 return -ENOMEDIUM; 940 } 941 942 ret = bdrv_check_byte_request(bs, offset, bytes); 943 if (ret < 0) { 944 return ret; 945 } 946 947 /* Don't do copy-on-read if we read data before write operation */ 948 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { 949 flags |= BDRV_REQ_COPY_ON_READ; 950 } 951 952 /* throttling disk I/O */ 953 if (bs->io_limits_enabled) { 954 throttle_group_co_io_limits_intercept(bs, bytes, false); 955 } 956 957 /* Align read if necessary by padding qiov */ 958 if (offset & (align - 1)) { 959 head_buf = qemu_blockalign(bs, align); 960 qemu_iovec_init(&local_qiov, qiov->niov + 2); 961 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 962 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 963 use_local_qiov = true; 964 965 bytes += offset & (align - 1); 966 offset = offset & ~(align - 1); 967 } 968 969 if ((offset + bytes) & (align - 1)) { 970 if (!use_local_qiov) { 971 qemu_iovec_init(&local_qiov, qiov->niov + 1); 972 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 973 use_local_qiov = true; 974 } 975 tail_buf = qemu_blockalign(bs, align); 976 qemu_iovec_add(&local_qiov, tail_buf, 977 align - ((offset + bytes) & (align - 1))); 978 
979 bytes = ROUND_UP(bytes, align); 980 } 981 982 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 983 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 984 use_local_qiov ? &local_qiov : qiov, 985 flags); 986 tracked_request_end(&req); 987 988 if (use_local_qiov) { 989 qemu_iovec_destroy(&local_qiov); 990 qemu_vfree(head_buf); 991 qemu_vfree(tail_buf); 992 } 993 994 return ret; 995 } 996 997 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 998 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 999 BdrvRequestFlags flags) 1000 { 1001 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1002 return -EINVAL; 1003 } 1004 1005 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 1006 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1007 } 1008 1009 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 1010 int nb_sectors, QEMUIOVector *qiov) 1011 { 1012 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1013 1014 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1015 } 1016 1017 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, 1018 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1019 { 1020 trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); 1021 1022 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1023 BDRV_REQ_NO_SERIALISING); 1024 } 1025 1026 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1027 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1028 { 1029 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1030 1031 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1032 BDRV_REQ_COPY_ON_READ); 1033 } 1034 1035 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1036 1037 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1038 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1039 { 1040 BlockDriver *drv = bs->drv; 1041 QEMUIOVector qiov; 1042 struct iovec iov = {0}; 1043 int ret = 0; 1044 1045 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1046 BDRV_REQUEST_MAX_SECTORS); 1047 1048 while (nb_sectors > 0 && !ret) { 1049 int num = nb_sectors; 1050 1051 /* Align request. Block drivers can expect the "bulk" of the request 1052 * to be aligned. 1053 */ 1054 if (bs->bl.write_zeroes_alignment 1055 && num > bs->bl.write_zeroes_alignment) { 1056 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1057 /* Make a small request up to the first aligned sector. */ 1058 num = bs->bl.write_zeroes_alignment; 1059 num -= sector_num % bs->bl.write_zeroes_alignment; 1060 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1061 /* Shorten the request to the last aligned sector. num cannot 1062 * underflow because num > bs->bl.write_zeroes_alignment. 
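                 * For example (illustrative): with write_zeroes_alignment == 8,
                 * sector_num == 8 and num == 13, the request is shortened to
                 * num == 8 so that it ends on an aligned boundary; the
                 * remaining sectors are handled by the next loop iteration.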
1063 */ 1064 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 1065 } 1066 } 1067 1068 /* limit request size */ 1069 if (num > max_write_zeroes) { 1070 num = max_write_zeroes; 1071 } 1072 1073 ret = -ENOTSUP; 1074 /* First try the efficient write zeroes operation */ 1075 if (drv->bdrv_co_write_zeroes) { 1076 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 1077 } 1078 1079 if (ret == -ENOTSUP) { 1080 /* Fall back to bounce buffer if write zeroes is unsupported */ 1081 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, 1082 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1083 num = MIN(num, max_xfer_len); 1084 iov.iov_len = num * BDRV_SECTOR_SIZE; 1085 if (iov.iov_base == NULL) { 1086 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); 1087 if (iov.iov_base == NULL) { 1088 ret = -ENOMEM; 1089 goto fail; 1090 } 1091 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 1092 } 1093 qemu_iovec_init_external(&qiov, &iov, 1); 1094 1095 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 1096 1097 /* Keep bounce buffer around if it is big enough for all 1098 * all future requests. 1099 */ 1100 if (num < max_xfer_len) { 1101 qemu_vfree(iov.iov_base); 1102 iov.iov_base = NULL; 1103 } 1104 } 1105 1106 sector_num += num; 1107 nb_sectors -= num; 1108 } 1109 1110 fail: 1111 qemu_vfree(iov.iov_base); 1112 return ret; 1113 } 1114 1115 /* 1116 * Forwards an already correctly aligned write request to the BlockDriver. 1117 */ 1118 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 1119 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1120 QEMUIOVector *qiov, int flags) 1121 { 1122 BlockDriver *drv = bs->drv; 1123 bool waited; 1124 int ret; 1125 1126 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 1127 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 1128 1129 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1130 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1131 assert(!qiov || bytes == qiov->size); 1132 1133 waited = wait_serialising_requests(req); 1134 assert(!waited || !req->serialising); 1135 assert(req->overlap_offset <= offset); 1136 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1137 1138 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1139 1140 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1141 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 1142 qemu_iovec_is_zero(qiov)) { 1143 flags |= BDRV_REQ_ZERO_WRITE; 1144 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1145 flags |= BDRV_REQ_MAY_UNMAP; 1146 } 1147 } 1148 1149 if (ret < 0) { 1150 /* Do nothing, write notifier decided to fail this request */ 1151 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1152 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1153 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 1154 } else { 1155 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1156 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 1157 } 1158 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 1159 1160 if (ret == 0 && !bs->enable_write_cache) { 1161 ret = bdrv_co_flush(bs); 1162 } 1163 1164 bdrv_set_dirty(bs, sector_num, nb_sectors); 1165 1166 if (bs->wr_highest_offset < offset + bytes) { 1167 bs->wr_highest_offset = offset + bytes; 1168 } 1169 1170 if (ret >= 0) { 1171 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1172 } 1173 1174 return ret; 1175 } 1176 1177 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, 1178 int64_t offset, 1179 unsigned int bytes, 
1180 BdrvRequestFlags flags, 1181 BdrvTrackedRequest *req) 1182 { 1183 uint8_t *buf = NULL; 1184 QEMUIOVector local_qiov; 1185 struct iovec iov; 1186 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1187 unsigned int head_padding_bytes, tail_padding_bytes; 1188 int ret = 0; 1189 1190 head_padding_bytes = offset & (align - 1); 1191 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1192 1193 1194 assert(flags & BDRV_REQ_ZERO_WRITE); 1195 if (head_padding_bytes || tail_padding_bytes) { 1196 buf = qemu_blockalign(bs, align); 1197 iov = (struct iovec) { 1198 .iov_base = buf, 1199 .iov_len = align, 1200 }; 1201 qemu_iovec_init_external(&local_qiov, &iov, 1); 1202 } 1203 if (head_padding_bytes) { 1204 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1205 1206 /* RMW the unaligned part before head. */ 1207 mark_request_serialising(req, align); 1208 wait_serialising_requests(req); 1209 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1210 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1211 align, &local_qiov, 0); 1212 if (ret < 0) { 1213 goto fail; 1214 } 1215 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1216 1217 memset(buf + head_padding_bytes, 0, zero_bytes); 1218 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1219 &local_qiov, 1220 flags & ~BDRV_REQ_ZERO_WRITE); 1221 if (ret < 0) { 1222 goto fail; 1223 } 1224 offset += zero_bytes; 1225 bytes -= zero_bytes; 1226 } 1227 1228 assert(!bytes || (offset & (align - 1)) == 0); 1229 if (bytes >= align) { 1230 /* Write the aligned part in the middle. */ 1231 uint64_t aligned_bytes = bytes & ~(align - 1); 1232 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1233 NULL, flags); 1234 if (ret < 0) { 1235 goto fail; 1236 } 1237 bytes -= aligned_bytes; 1238 offset += aligned_bytes; 1239 } 1240 1241 assert(!bytes || (offset & (align - 1)) == 0); 1242 if (bytes) { 1243 assert(align == tail_padding_bytes + bytes); 1244 /* RMW the unaligned part after tail. 
*/ 1245 mark_request_serialising(req, align); 1246 wait_serialising_requests(req); 1247 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1248 ret = bdrv_aligned_preadv(bs, req, offset, align, 1249 align, &local_qiov, 0); 1250 if (ret < 0) { 1251 goto fail; 1252 } 1253 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1254 1255 memset(buf, 0, bytes); 1256 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1257 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1258 } 1259 fail: 1260 qemu_vfree(buf); 1261 return ret; 1262 1263 } 1264 1265 /* 1266 * Handle a write request in coroutine context 1267 */ 1268 int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1269 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1270 BdrvRequestFlags flags) 1271 { 1272 BdrvTrackedRequest req; 1273 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1274 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1275 uint8_t *head_buf = NULL; 1276 uint8_t *tail_buf = NULL; 1277 QEMUIOVector local_qiov; 1278 bool use_local_qiov = false; 1279 int ret; 1280 1281 if (!bs->drv) { 1282 return -ENOMEDIUM; 1283 } 1284 if (bs->read_only) { 1285 return -EPERM; 1286 } 1287 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1288 1289 ret = bdrv_check_byte_request(bs, offset, bytes); 1290 if (ret < 0) { 1291 return ret; 1292 } 1293 1294 /* throttling disk I/O */ 1295 if (bs->io_limits_enabled) { 1296 throttle_group_co_io_limits_intercept(bs, bytes, true); 1297 } 1298 1299 /* 1300 * Align write if necessary by performing a read-modify-write cycle. 1301 * Pad qiov with the read parts and be sure to have a tracked request not 1302 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1303 */ 1304 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1305 1306 if (!qiov) { 1307 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1308 goto out; 1309 } 1310 1311 if (offset & (align - 1)) { 1312 QEMUIOVector head_qiov; 1313 struct iovec head_iov; 1314 1315 mark_request_serialising(&req, align); 1316 wait_serialising_requests(&req); 1317 1318 head_buf = qemu_blockalign(bs, align); 1319 head_iov = (struct iovec) { 1320 .iov_base = head_buf, 1321 .iov_len = align, 1322 }; 1323 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1324 1325 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1326 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1327 align, &head_qiov, 0); 1328 if (ret < 0) { 1329 goto fail; 1330 } 1331 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1332 1333 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1334 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1335 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1336 use_local_qiov = true; 1337 1338 bytes += offset & (align - 1); 1339 offset = offset & ~(align - 1); 1340 } 1341 1342 if ((offset + bytes) & (align - 1)) { 1343 QEMUIOVector tail_qiov; 1344 struct iovec tail_iov; 1345 size_t tail_bytes; 1346 bool waited; 1347 1348 mark_request_serialising(&req, align); 1349 waited = wait_serialising_requests(&req); 1350 assert(!waited || !use_local_qiov); 1351 1352 tail_buf = qemu_blockalign(bs, align); 1353 tail_iov = (struct iovec) { 1354 .iov_base = tail_buf, 1355 .iov_len = align, 1356 }; 1357 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1358 1359 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1360 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1361 align, &tail_qiov, 0); 1362 if (ret < 0) { 1363 goto fail; 1364 } 1365 bdrv_debug_event(bs, 
BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1366 1367 if (!use_local_qiov) { 1368 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1369 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1370 use_local_qiov = true; 1371 } 1372 1373 tail_bytes = (offset + bytes) & (align - 1); 1374 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1375 1376 bytes = ROUND_UP(bytes, align); 1377 } 1378 1379 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1380 use_local_qiov ? &local_qiov : qiov, 1381 flags); 1382 1383 fail: 1384 1385 if (use_local_qiov) { 1386 qemu_iovec_destroy(&local_qiov); 1387 } 1388 qemu_vfree(head_buf); 1389 qemu_vfree(tail_buf); 1390 out: 1391 tracked_request_end(&req); 1392 return ret; 1393 } 1394 1395 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1396 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1397 BdrvRequestFlags flags) 1398 { 1399 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1400 return -EINVAL; 1401 } 1402 1403 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1404 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1405 } 1406 1407 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1408 int nb_sectors, QEMUIOVector *qiov) 1409 { 1410 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1411 1412 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1413 } 1414 1415 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1416 int64_t sector_num, int nb_sectors, 1417 BdrvRequestFlags flags) 1418 { 1419 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1420 1421 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1422 flags &= ~BDRV_REQ_MAY_UNMAP; 1423 } 1424 1425 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1426 BDRV_REQ_ZERO_WRITE | flags); 1427 } 1428 1429 typedef struct BdrvCoGetBlockStatusData { 1430 BlockDriverState *bs; 1431 BlockDriverState *base; 1432 BlockDriverState **file; 1433 int64_t sector_num; 1434 int nb_sectors; 1435 int *pnum; 1436 int64_t ret; 1437 bool done; 1438 } BdrvCoGetBlockStatusData; 1439 1440 /* 1441 * Returns the allocation status of the specified sectors. 1442 * Drivers not implementing the functionality are assumed to not support 1443 * backing files, hence all their sectors are reported as allocated. 1444 * 1445 * If 'sector_num' is beyond the end of the disk image the return value is 0 1446 * and 'pnum' is set to 0. 1447 * 1448 * 'pnum' is set to the number of sectors (including and immediately following 1449 * the specified sector) that are known to be in the same 1450 * allocated/unallocated state. 1451 * 1452 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1453 * beyond the end of the disk image it will be clamped. 1454 * 1455 * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' 1456 * points to the BDS which the sector range is allocated in. 
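 *
 * A typical caller only tests a few of the returned bits, e.g. via the
 * synchronous wrapper (illustrative):
 *
 *     ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum, &file);
 *     if (ret >= 0 && (ret & BDRV_BLOCK_ZERO)) {
 *         ... the first pnum sectors are known to read as zeroes ...
 *     }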
1457 */ 1458 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1459 int64_t sector_num, 1460 int nb_sectors, int *pnum, 1461 BlockDriverState **file) 1462 { 1463 int64_t total_sectors; 1464 int64_t n; 1465 int64_t ret, ret2; 1466 1467 total_sectors = bdrv_nb_sectors(bs); 1468 if (total_sectors < 0) { 1469 return total_sectors; 1470 } 1471 1472 if (sector_num >= total_sectors) { 1473 *pnum = 0; 1474 return 0; 1475 } 1476 1477 n = total_sectors - sector_num; 1478 if (n < nb_sectors) { 1479 nb_sectors = n; 1480 } 1481 1482 if (!bs->drv->bdrv_co_get_block_status) { 1483 *pnum = nb_sectors; 1484 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1485 if (bs->drv->protocol_name) { 1486 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1487 } 1488 return ret; 1489 } 1490 1491 *file = NULL; 1492 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, 1493 file); 1494 if (ret < 0) { 1495 *pnum = 0; 1496 return ret; 1497 } 1498 1499 if (ret & BDRV_BLOCK_RAW) { 1500 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1501 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1502 *pnum, pnum, file); 1503 } 1504 1505 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1506 ret |= BDRV_BLOCK_ALLOCATED; 1507 } else { 1508 if (bdrv_unallocated_blocks_are_zero(bs)) { 1509 ret |= BDRV_BLOCK_ZERO; 1510 } else if (bs->backing) { 1511 BlockDriverState *bs2 = bs->backing->bs; 1512 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1513 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1514 ret |= BDRV_BLOCK_ZERO; 1515 } 1516 } 1517 } 1518 1519 if (*file && *file != bs && 1520 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1521 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1522 BlockDriverState *file2; 1523 int file_pnum; 1524 1525 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, 1526 *pnum, &file_pnum, &file2); 1527 if (ret2 >= 0) { 1528 /* Ignore errors. This is just providing extra information, it 1529 * is useful but not necessary. 1530 */ 1531 if (!file_pnum) { 1532 /* !file_pnum indicates an offset at or beyond the EOF; it is 1533 * perfectly valid for the format block driver to point to such 1534 * offsets, so catch it and mark everything as zero */ 1535 ret |= BDRV_BLOCK_ZERO; 1536 } else { 1537 /* Limit request to the range reported by the protocol driver */ 1538 *pnum = file_pnum; 1539 ret |= (ret2 & BDRV_BLOCK_ZERO); 1540 } 1541 } 1542 } 1543 1544 return ret; 1545 } 1546 1547 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1548 BlockDriverState *base, 1549 int64_t sector_num, 1550 int nb_sectors, 1551 int *pnum, 1552 BlockDriverState **file) 1553 { 1554 BlockDriverState *p; 1555 int64_t ret = 0; 1556 1557 assert(bs != base); 1558 for (p = bs; p != base; p = backing_bs(p)) { 1559 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); 1560 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1561 break; 1562 } 1563 /* [sector_num, pnum] unallocated on this layer, which could be only 1564 * the first part of [sector_num, nb_sectors]. 
*/ 1565 nb_sectors = MIN(nb_sectors, *pnum); 1566 } 1567 return ret; 1568 } 1569 1570 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1571 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1572 { 1573 BdrvCoGetBlockStatusData *data = opaque; 1574 1575 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1576 data->sector_num, 1577 data->nb_sectors, 1578 data->pnum, 1579 data->file); 1580 data->done = true; 1581 } 1582 1583 /* 1584 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1585 * 1586 * See bdrv_co_get_block_status_above() for details. 1587 */ 1588 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1589 BlockDriverState *base, 1590 int64_t sector_num, 1591 int nb_sectors, int *pnum, 1592 BlockDriverState **file) 1593 { 1594 Coroutine *co; 1595 BdrvCoGetBlockStatusData data = { 1596 .bs = bs, 1597 .base = base, 1598 .file = file, 1599 .sector_num = sector_num, 1600 .nb_sectors = nb_sectors, 1601 .pnum = pnum, 1602 .done = false, 1603 }; 1604 1605 if (qemu_in_coroutine()) { 1606 /* Fast-path if already in coroutine context */ 1607 bdrv_get_block_status_above_co_entry(&data); 1608 } else { 1609 AioContext *aio_context = bdrv_get_aio_context(bs); 1610 1611 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1612 qemu_coroutine_enter(co, &data); 1613 while (!data.done) { 1614 aio_poll(aio_context, true); 1615 } 1616 } 1617 return data.ret; 1618 } 1619 1620 int64_t bdrv_get_block_status(BlockDriverState *bs, 1621 int64_t sector_num, 1622 int nb_sectors, int *pnum, 1623 BlockDriverState **file) 1624 { 1625 return bdrv_get_block_status_above(bs, backing_bs(bs), 1626 sector_num, nb_sectors, pnum, file); 1627 } 1628 1629 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1630 int nb_sectors, int *pnum) 1631 { 1632 BlockDriverState *file; 1633 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, 1634 &file); 1635 if (ret < 0) { 1636 return ret; 1637 } 1638 return !!(ret & BDRV_BLOCK_ALLOCATED); 1639 } 1640 1641 /* 1642 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1643 * 1644 * Return true if the given sector is allocated in any image between 1645 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1646 * sector is allocated in any image of the chain. Return false otherwise. 1647 * 1648 * 'pnum' is set to the number of sectors (including and immediately following 1649 * the specified sector) that are known to be in the same 1650 * allocated/unallocated state. 1651 * 1652 */ 1653 int bdrv_is_allocated_above(BlockDriverState *top, 1654 BlockDriverState *base, 1655 int64_t sector_num, 1656 int nb_sectors, int *pnum) 1657 { 1658 BlockDriverState *intermediate; 1659 int ret, n = nb_sectors; 1660 1661 intermediate = top; 1662 while (intermediate && intermediate != base) { 1663 int pnum_inter; 1664 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1665 &pnum_inter); 1666 if (ret < 0) { 1667 return ret; 1668 } else if (ret) { 1669 *pnum = pnum_inter; 1670 return 1; 1671 } 1672 1673 /* 1674 * [sector_num, nb_sectors] is unallocated on top but intermediate 1675 * might have 1676 * 1677 * [sector_num+x, nr_sectors] allocated. 
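         * For instance (illustrative, assuming the drivers report allocation
         * at this granularity): querying [0, 100) where TOP is fully
         * unallocated but INTER1 allocates [50, 100) yields pnum_inter == 50
         * on the INTER1 pass, so n is clamped to 50 and only [0, 50) is
         * reported as unallocated above BASE.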
1678 */ 1679 if (n > pnum_inter && 1680 (intermediate == top || 1681 sector_num + pnum_inter < intermediate->total_sectors)) { 1682 n = pnum_inter; 1683 } 1684 1685 intermediate = backing_bs(intermediate); 1686 } 1687 1688 *pnum = n; 1689 return 0; 1690 } 1691 1692 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1693 const uint8_t *buf, int nb_sectors) 1694 { 1695 BlockDriver *drv = bs->drv; 1696 int ret; 1697 1698 if (!drv) { 1699 return -ENOMEDIUM; 1700 } 1701 if (!drv->bdrv_write_compressed) { 1702 return -ENOTSUP; 1703 } 1704 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1705 if (ret < 0) { 1706 return ret; 1707 } 1708 1709 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1710 1711 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1712 } 1713 1714 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1715 int64_t pos, int size) 1716 { 1717 QEMUIOVector qiov; 1718 struct iovec iov = { 1719 .iov_base = (void *) buf, 1720 .iov_len = size, 1721 }; 1722 1723 qemu_iovec_init_external(&qiov, &iov, 1); 1724 return bdrv_writev_vmstate(bs, &qiov, pos); 1725 } 1726 1727 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1728 { 1729 BlockDriver *drv = bs->drv; 1730 1731 if (!drv) { 1732 return -ENOMEDIUM; 1733 } else if (drv->bdrv_save_vmstate) { 1734 return drv->bdrv_save_vmstate(bs, qiov, pos); 1735 } else if (bs->file) { 1736 return bdrv_writev_vmstate(bs->file->bs, qiov, pos); 1737 } 1738 1739 return -ENOTSUP; 1740 } 1741 1742 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1743 int64_t pos, int size) 1744 { 1745 BlockDriver *drv = bs->drv; 1746 if (!drv) 1747 return -ENOMEDIUM; 1748 if (drv->bdrv_load_vmstate) 1749 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1750 if (bs->file) 1751 return bdrv_load_vmstate(bs->file->bs, buf, pos, size); 1752 return -ENOTSUP; 1753 } 1754 1755 /**************************************************************/ 1756 /* async I/Os */ 1757 1758 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1759 QEMUIOVector *qiov, int nb_sectors, 1760 BlockCompletionFunc *cb, void *opaque) 1761 { 1762 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1763 1764 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1765 cb, opaque, false); 1766 } 1767 1768 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1769 QEMUIOVector *qiov, int nb_sectors, 1770 BlockCompletionFunc *cb, void *opaque) 1771 { 1772 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1773 1774 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1775 cb, opaque, true); 1776 } 1777 1778 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1779 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1780 BlockCompletionFunc *cb, void *opaque) 1781 { 1782 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1783 1784 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1785 BDRV_REQ_ZERO_WRITE | flags, 1786 cb, opaque, true); 1787 } 1788 1789 1790 typedef struct MultiwriteCB { 1791 int error; 1792 int num_requests; 1793 int num_callbacks; 1794 struct { 1795 BlockCompletionFunc *cb; 1796 void *opaque; 1797 QEMUIOVector *free_qiov; 1798 } callbacks[]; 1799 } MultiwriteCB; 1800 1801 static void multiwrite_user_cb(MultiwriteCB *mcb) 1802 { 1803 int i; 1804 1805 for (i = 0; i < mcb->num_callbacks; i++) { 1806 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1807 if (mcb->callbacks[i].free_qiov) { 1808 
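            /* free_qiov is only set by multiwrite_merge() for requests that
             * were merged away; it holds the temporary merged vector. */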
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
            bs->bl.max_iov) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    if (bs->blk) {
        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
                              num_reqs - outidx - 1);
    }

    return outidx + 1;
}
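
/*
 * Illustrative sketch (not from the original sources): a caller batching two
 * writes fills a BlockRequest array and submits it in one go:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = &qiov0, .cb = cb0, .opaque = op0 },
 *         { .sector = 16, .nb_sectors = 8, .qiov = &qiov1, .cb = cb1, .opaque = op1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... inspect reqs[i].error for requests that were failed up front ...
 *     }
 *
 * The field names follow the BlockRequest members already used in this file
 * (sector, nb_sectors, qiov, cb, opaque, error, flags); the variable names are
 * placeholders.
 */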

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case, this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async; otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
 */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH* bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}
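
/*
 * Completion of a coroutine-backed AIOCB must not run before the submitting
 * function (e.g. bdrv_co_aio_rw_vector below) has returned the AIOCB to its
 * caller. Hence acb->need_bh starts out true: if the coroutine finishes while
 * the request is still being set up, bdrv_co_complete() does nothing, and
 * bdrv_co_maybe_schedule_bh() later defers the callback to a bottom half.
 */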

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_malloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_free(acb);
    }
}
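
/*
 * AIOCB lifetime, as implemented above: qemu_aio_get() returns an AIOCB with
 * refcnt == 1; the emulation code drops that reference with qemu_aio_unref()
 * after running the completion callback, and bdrv_aio_cancel() temporarily
 * takes an extra reference so that it can poll until the callback has run.
 */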

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;
    BdrvTrackedRequest req;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    tracked_request_end(&req);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    BdrvTrackedRequest req;
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors,
                          BDRV_TRACKED_DISCARD);
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    ret = 0;
out:
    tracked_request_end(&req);
    return ret;
}
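
/*
 * Synchronous wrapper around bdrv_co_discard(). When called from coroutine
 * context it runs the request directly; otherwise it spawns a coroutine and
 * polls the AioContext of @bs until the request has completed.
 */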
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct {
    CoroutineIOCompletion *co;
    QEMUBH *bh;
} BdrvIoctlCompletionData;

static void bdrv_ioctl_bh_cb(void *opaque)
{
    BdrvIoctlCompletionData *data = opaque;

    bdrv_co_io_em_complete(data->co, -ENOTSUP);
    qemu_bh_delete(data->bh);
    g_free(data);
}

static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest tracked_req;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
    if (!drv || !drv->bdrv_aio_ioctl) {
        co.ret = -ENOTSUP;
        goto out;
    }

    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
    if (!acb) {
        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
                              bdrv_ioctl_bh_cb, data);
        data->co = &co;
        qemu_bh_schedule(data->bh);
    }
    qemu_coroutine_yield();
out:
    tracked_request_end(&tracked_req);
    return co.ret;
}

typedef struct {
    BlockDriverState *bs;
    int req;
    void *buf;
    int ret;
} BdrvIoctlCoData;

static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
{
    BdrvIoctlCoData *data = opaque;
    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
}

/* needed for generic scsi interface */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BdrvIoctlCoData data = {
        .bs = bs,
        .req = req,
        .buf = buf,
        .ret = -EINPROGRESS,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_co_ioctl_entry(&data);
    } else {
        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);

        qemu_coroutine_enter(co, &data);
        while (data.ret == -EINPROGRESS) {
            aio_poll(bdrv_get_aio_context(bs), true);
        }
    }
    return data.ret;
}

static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
                                      acb->req.req, acb->req.buf);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
                                            bs, cb, opaque);
    Coroutine *co;

    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.req = req;
    acb->req.buf = buf;
    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}
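
/*
 * Like qemu_blockalign(), but returns NULL on allocation failure instead of
 * aborting, since it is built on qemu_try_memalign() rather than
 * qemu_memalign().
 */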
void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is aligned to the minimum memory
 * alignment of the BlockDriverState.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file->bs);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file->bs);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file->bs);
    }
    bdrv_start_throttled_reqs(bs);
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    if (!bs->quiesce_counter++) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }
    bdrv_drain(bs);
}

void bdrv_drained_end(BlockDriverState *bs)
{
    assert(bs->quiesce_counter > 0);
    if (--bs->quiesce_counter > 0) {
        return;
    }
    aio_enable_external(bdrv_get_aio_context(bs));
}
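
/*
 * Illustrative use of a drained section (e.g. before modifying the graph):
 *
 *     bdrv_drained_begin(bs);    // disable external request sources, drain bs
 *     ... perform the operation that must not race with new I/O ...
 *     bdrv_drained_end(bs);      // re-enable external request sources
 */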