1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "trace.h" 27 #include "sysemu/block-backend.h" 28 #include "block/blockjob.h" 29 #include "block/block_int.h" 30 #include "block/throttle-groups.h" 31 #include "qemu/cutils.h" 32 #include "qapi/error.h" 33 #include "qemu/error-report.h" 34 35 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 36 37 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 38 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 39 BlockCompletionFunc *cb, void *opaque); 40 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 41 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 42 BlockCompletionFunc *cb, void *opaque); 43 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 44 int64_t sector_num, int nb_sectors, 45 QEMUIOVector *iov); 46 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 47 int64_t sector_num, int nb_sectors, 48 QEMUIOVector *iov); 49 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 50 int64_t sector_num, 51 QEMUIOVector *qiov, 52 int nb_sectors, 53 BdrvRequestFlags flags, 54 BlockCompletionFunc *cb, 55 void *opaque, 56 bool is_write); 57 static void coroutine_fn bdrv_co_do_rw(void *opaque); 58 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 59 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); 60 61 /* throttling disk I/O limits */ 62 void bdrv_set_io_limits(BlockDriverState *bs, 63 ThrottleConfig *cfg) 64 { 65 int i; 66 67 throttle_group_config(bs, cfg); 68 69 for (i = 0; i < 2; i++) { 70 qemu_co_enter_next(&bs->throttled_reqs[i]); 71 } 72 } 73 74 /* this function drain all the throttled IOs */ 75 static bool bdrv_start_throttled_reqs(BlockDriverState *bs) 76 { 77 bool drained = false; 78 bool enabled = bs->io_limits_enabled; 79 int i; 80 81 bs->io_limits_enabled = false; 82 83 for (i = 0; i < 2; i++) { 84 while (qemu_co_enter_next(&bs->throttled_reqs[i])) { 85 drained = true; 86 } 87 } 88 89 bs->io_limits_enabled = enabled; 90 91 return drained; 92 } 93 94 void bdrv_io_limits_disable(BlockDriverState *bs) 95 { 96 bs->io_limits_enabled = false; 97 bdrv_start_throttled_reqs(bs); 98 throttle_group_unregister_bs(bs); 99 } 100 101 /* should be called before bdrv_set_io_limits if a limit is set */ 102 void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) 103 
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is already part of the same group as the one we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
        bs->bl.max_iov = bs->file->bs->bl.max_iov;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
        bs->bl.max_iov =
            MIN(bs->bl.max_iov,
                bs->backing->bs->bl.max_iov);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
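 *
 * As an illustrative sketch (hypothetical users A and B, not code taken from
 * any caller), two independent users can stack their requests and the flag
 * only returns to zero once both have undone their increment:
 *
 *     bdrv_enable_copy_on_read(bs);     -> copy_on_read == 1  (user A)
 *     bdrv_enable_copy_on_read(bs);     -> copy_on_read == 2  (user B)
 *     bdrv_disable_copy_on_read(bs);    -> copy_on_read == 1  (user B done)
 *     bdrv_disable_copy_on_read(bs);    -> copy_on_read == 0  (user A done)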
208 */ 209 void bdrv_enable_copy_on_read(BlockDriverState *bs) 210 { 211 bs->copy_on_read++; 212 } 213 214 void bdrv_disable_copy_on_read(BlockDriverState *bs) 215 { 216 assert(bs->copy_on_read > 0); 217 bs->copy_on_read--; 218 } 219 220 /* Check if any requests are in-flight (including throttled requests) */ 221 bool bdrv_requests_pending(BlockDriverState *bs) 222 { 223 BdrvChild *child; 224 225 if (!QLIST_EMPTY(&bs->tracked_requests)) { 226 return true; 227 } 228 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 229 return true; 230 } 231 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 232 return true; 233 } 234 235 QLIST_FOREACH(child, &bs->children, next) { 236 if (bdrv_requests_pending(child->bs)) { 237 return true; 238 } 239 } 240 241 return false; 242 } 243 244 static void bdrv_drain_recurse(BlockDriverState *bs) 245 { 246 BdrvChild *child; 247 248 if (bs->drv && bs->drv->bdrv_drain) { 249 bs->drv->bdrv_drain(bs); 250 } 251 QLIST_FOREACH(child, &bs->children, next) { 252 bdrv_drain_recurse(child->bs); 253 } 254 } 255 256 /* 257 * Wait for pending requests to complete on a single BlockDriverState subtree, 258 * and suspend block driver's internal I/O until next request arrives. 259 * 260 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 261 * AioContext. 262 * 263 * Only this BlockDriverState's AioContext is run, so in-flight requests must 264 * not depend on events in other AioContexts. In that case, use 265 * bdrv_drain_all() instead. 266 */ 267 void bdrv_drain(BlockDriverState *bs) 268 { 269 bool busy = true; 270 271 bdrv_drain_recurse(bs); 272 while (busy) { 273 /* Keep iterating */ 274 bdrv_flush_io_queue(bs); 275 busy = bdrv_requests_pending(bs); 276 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 277 } 278 } 279 280 /* 281 * Wait for pending requests to complete across all BlockDriverStates 282 * 283 * This function does not flush data to disk, use bdrv_flush_all() for that 284 * after calling this function. 285 */ 286 void bdrv_drain_all(void) 287 { 288 /* Always run first iteration so any pending completion BHs run */ 289 bool busy = true; 290 BlockDriverState *bs = NULL; 291 GSList *aio_ctxs = NULL, *ctx; 292 293 while ((bs = bdrv_next(bs))) { 294 AioContext *aio_context = bdrv_get_aio_context(bs); 295 296 aio_context_acquire(aio_context); 297 if (bs->job) { 298 block_job_pause(bs->job); 299 } 300 bdrv_drain_recurse(bs); 301 aio_context_release(aio_context); 302 303 if (!g_slist_find(aio_ctxs, aio_context)) { 304 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 305 } 306 } 307 308 /* Note that completion of an asynchronous I/O operation can trigger any 309 * number of other I/O operations on other devices---for example a 310 * coroutine can submit an I/O request to another device in response to 311 * request completion. Therefore we must keep looping until there was no 312 * more activity rather than simply draining each device independently. 
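     *
     * As an illustration (hypothetical drives, not taken from this file): a
     * read completing on drive A may, from its completion callback, submit a
     * write to drive B. Draining B before A would miss that write, so the
     * loop below repeats until one full pass over every AioContext finds no
     * pending work.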
313 */ 314 while (busy) { 315 busy = false; 316 317 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 318 AioContext *aio_context = ctx->data; 319 bs = NULL; 320 321 aio_context_acquire(aio_context); 322 while ((bs = bdrv_next(bs))) { 323 if (aio_context == bdrv_get_aio_context(bs)) { 324 bdrv_flush_io_queue(bs); 325 if (bdrv_requests_pending(bs)) { 326 busy = true; 327 aio_poll(aio_context, busy); 328 } 329 } 330 } 331 busy |= aio_poll(aio_context, false); 332 aio_context_release(aio_context); 333 } 334 } 335 336 bs = NULL; 337 while ((bs = bdrv_next(bs))) { 338 AioContext *aio_context = bdrv_get_aio_context(bs); 339 340 aio_context_acquire(aio_context); 341 if (bs->job) { 342 block_job_resume(bs->job); 343 } 344 aio_context_release(aio_context); 345 } 346 g_slist_free(aio_ctxs); 347 } 348 349 /** 350 * Remove an active request from the tracked requests list 351 * 352 * This function should be called when a tracked request is completing. 353 */ 354 static void tracked_request_end(BdrvTrackedRequest *req) 355 { 356 if (req->serialising) { 357 req->bs->serialising_in_flight--; 358 } 359 360 QLIST_REMOVE(req, list); 361 qemu_co_queue_restart_all(&req->wait_queue); 362 } 363 364 /** 365 * Add an active request to the tracked requests list 366 */ 367 static void tracked_request_begin(BdrvTrackedRequest *req, 368 BlockDriverState *bs, 369 int64_t offset, 370 unsigned int bytes, 371 enum BdrvTrackedRequestType type) 372 { 373 *req = (BdrvTrackedRequest){ 374 .bs = bs, 375 .offset = offset, 376 .bytes = bytes, 377 .type = type, 378 .co = qemu_coroutine_self(), 379 .serialising = false, 380 .overlap_offset = offset, 381 .overlap_bytes = bytes, 382 }; 383 384 qemu_co_queue_init(&req->wait_queue); 385 386 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 387 } 388 389 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 390 { 391 int64_t overlap_offset = req->offset & ~(align - 1); 392 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 393 - overlap_offset; 394 395 if (!req->serialising) { 396 req->bs->serialising_in_flight++; 397 req->serialising = true; 398 } 399 400 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 401 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 402 } 403 404 /** 405 * Round a region to cluster boundaries 406 */ 407 void bdrv_round_to_clusters(BlockDriverState *bs, 408 int64_t sector_num, int nb_sectors, 409 int64_t *cluster_sector_num, 410 int *cluster_nb_sectors) 411 { 412 BlockDriverInfo bdi; 413 414 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 415 *cluster_sector_num = sector_num; 416 *cluster_nb_sectors = nb_sectors; 417 } else { 418 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 419 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 420 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 421 nb_sectors, c); 422 } 423 } 424 425 static int bdrv_get_cluster_size(BlockDriverState *bs) 426 { 427 BlockDriverInfo bdi; 428 int ret; 429 430 ret = bdrv_get_info(bs, &bdi); 431 if (ret < 0 || bdi.cluster_size == 0) { 432 return bs->request_alignment; 433 } else { 434 return bdi.cluster_size; 435 } 436 } 437 438 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 439 int64_t offset, unsigned int bytes) 440 { 441 /* aaaa bbbb */ 442 if (offset >= req->overlap_offset + req->overlap_bytes) { 443 return false; 444 } 445 /* bbbb aaaa */ 446 if (req->overlap_offset >= offset + bytes) { 447 return false; 448 } 449 return true; 450 } 451 452 static bool coroutine_fn 
wait_serialising_requests(BdrvTrackedRequest *self) 453 { 454 BlockDriverState *bs = self->bs; 455 BdrvTrackedRequest *req; 456 bool retry; 457 bool waited = false; 458 459 if (!bs->serialising_in_flight) { 460 return false; 461 } 462 463 do { 464 retry = false; 465 QLIST_FOREACH(req, &bs->tracked_requests, list) { 466 if (req == self || (!req->serialising && !self->serialising)) { 467 continue; 468 } 469 if (tracked_request_overlaps(req, self->overlap_offset, 470 self->overlap_bytes)) 471 { 472 /* Hitting this means there was a reentrant request, for 473 * example, a block driver issuing nested requests. This must 474 * never happen since it means deadlock. 475 */ 476 assert(qemu_coroutine_self() != req->co); 477 478 /* If the request is already (indirectly) waiting for us, or 479 * will wait for us as soon as it wakes up, then just go on 480 * (instead of producing a deadlock in the former case). */ 481 if (!req->waiting_for) { 482 self->waiting_for = req; 483 qemu_co_queue_wait(&req->wait_queue); 484 self->waiting_for = NULL; 485 retry = true; 486 waited = true; 487 break; 488 } 489 } 490 } 491 } while (retry); 492 493 return waited; 494 } 495 496 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 497 size_t size) 498 { 499 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 500 return -EIO; 501 } 502 503 if (!bdrv_is_inserted(bs)) { 504 return -ENOMEDIUM; 505 } 506 507 if (offset < 0) { 508 return -EIO; 509 } 510 511 return 0; 512 } 513 514 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 515 int nb_sectors) 516 { 517 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 518 return -EIO; 519 } 520 521 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 522 nb_sectors * BDRV_SECTOR_SIZE); 523 } 524 525 typedef struct RwCo { 526 BlockDriverState *bs; 527 int64_t offset; 528 QEMUIOVector *qiov; 529 bool is_write; 530 int ret; 531 BdrvRequestFlags flags; 532 } RwCo; 533 534 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 535 { 536 RwCo *rwco = opaque; 537 538 if (!rwco->is_write) { 539 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 540 rwco->qiov->size, rwco->qiov, 541 rwco->flags); 542 } else { 543 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 544 rwco->qiov->size, rwco->qiov, 545 rwco->flags); 546 } 547 } 548 549 /* 550 * Process a vectored synchronous request using coroutines 551 */ 552 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 553 QEMUIOVector *qiov, bool is_write, 554 BdrvRequestFlags flags) 555 { 556 Coroutine *co; 557 RwCo rwco = { 558 .bs = bs, 559 .offset = offset, 560 .qiov = qiov, 561 .is_write = is_write, 562 .ret = NOT_DONE, 563 .flags = flags, 564 }; 565 566 /** 567 * In sync call context, when the vcpu is blocked, this throttling timer 568 * will not fire; so the I/O throttling function has to be disabled here 569 * if it has been enabled. 
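     *
     * (This is also why the warning printed below can appear: synchronous
     * wrappers such as bdrv_pread() and bdrv_pwrite() end up here and spin
     * in aio_poll() with the vcpu blocked, so leaving throttling enabled
     * could stall them indefinitely.)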
570 */ 571 if (bs->io_limits_enabled) { 572 fprintf(stderr, "Disabling I/O throttling on '%s' due " 573 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 574 bdrv_io_limits_disable(bs); 575 } 576 577 if (qemu_in_coroutine()) { 578 /* Fast-path if already in coroutine context */ 579 bdrv_rw_co_entry(&rwco); 580 } else { 581 AioContext *aio_context = bdrv_get_aio_context(bs); 582 583 co = qemu_coroutine_create(bdrv_rw_co_entry); 584 qemu_coroutine_enter(co, &rwco); 585 while (rwco.ret == NOT_DONE) { 586 aio_poll(aio_context, true); 587 } 588 } 589 return rwco.ret; 590 } 591 592 /* 593 * Process a synchronous request using coroutines 594 */ 595 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 596 int nb_sectors, bool is_write, BdrvRequestFlags flags) 597 { 598 QEMUIOVector qiov; 599 struct iovec iov = { 600 .iov_base = (void *)buf, 601 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 602 }; 603 604 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 605 return -EINVAL; 606 } 607 608 qemu_iovec_init_external(&qiov, &iov, 1); 609 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 610 &qiov, is_write, flags); 611 } 612 613 /* return < 0 if error. See bdrv_write() for the return codes */ 614 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 615 uint8_t *buf, int nb_sectors) 616 { 617 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 618 } 619 620 /* Return < 0 if error. Important errors are: 621 -EIO generic I/O error (may happen for all errors) 622 -ENOMEDIUM No media inserted. 623 -EINVAL Invalid sector number or nb_sectors 624 -EACCES Trying to write a read-only device 625 */ 626 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 627 const uint8_t *buf, int nb_sectors) 628 { 629 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 630 } 631 632 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 633 int nb_sectors, BdrvRequestFlags flags) 634 { 635 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 636 BDRV_REQ_ZERO_WRITE | flags); 637 } 638 639 /* 640 * Completely zero out a block device with the help of bdrv_write_zeroes. 641 * The operation is sped up by checking the block status and only writing 642 * zeroes to the device if they currently do not return zeroes. Optional 643 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 644 * 645 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
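 *
 * A minimal usage sketch (hypothetical caller, error handling elided):
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         error_report("zeroing failed: %s", strerror(-ret));
 *     }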
646 */ 647 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 648 { 649 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 650 BlockDriverState *file; 651 int n; 652 653 target_sectors = bdrv_nb_sectors(bs); 654 if (target_sectors < 0) { 655 return target_sectors; 656 } 657 658 for (;;) { 659 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 660 if (nb_sectors <= 0) { 661 return 0; 662 } 663 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); 664 if (ret < 0) { 665 error_report("error getting block status at sector %" PRId64 ": %s", 666 sector_num, strerror(-ret)); 667 return ret; 668 } 669 if (ret & BDRV_BLOCK_ZERO) { 670 sector_num += n; 671 continue; 672 } 673 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 674 if (ret < 0) { 675 error_report("error writing zeroes at sector %" PRId64 ": %s", 676 sector_num, strerror(-ret)); 677 return ret; 678 } 679 sector_num += n; 680 } 681 } 682 683 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 684 { 685 QEMUIOVector qiov; 686 struct iovec iov = { 687 .iov_base = (void *)buf, 688 .iov_len = bytes, 689 }; 690 int ret; 691 692 if (bytes < 0) { 693 return -EINVAL; 694 } 695 696 qemu_iovec_init_external(&qiov, &iov, 1); 697 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 698 if (ret < 0) { 699 return ret; 700 } 701 702 return bytes; 703 } 704 705 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 706 { 707 int ret; 708 709 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 710 if (ret < 0) { 711 return ret; 712 } 713 714 return qiov->size; 715 } 716 717 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 718 const void *buf, int bytes) 719 { 720 QEMUIOVector qiov; 721 struct iovec iov = { 722 .iov_base = (void *) buf, 723 .iov_len = bytes, 724 }; 725 726 if (bytes < 0) { 727 return -EINVAL; 728 } 729 730 qemu_iovec_init_external(&qiov, &iov, 1); 731 return bdrv_pwritev(bs, offset, &qiov); 732 } 733 734 /* 735 * Writes to the file and ensures that no writes are reordered across this 736 * request (acts as a barrier) 737 * 738 * Returns 0 on success, -errno in error cases. 739 */ 740 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 741 const void *buf, int count) 742 { 743 int ret; 744 745 ret = bdrv_pwrite(bs, offset, buf, count); 746 if (ret < 0) { 747 return ret; 748 } 749 750 /* No flush needed for cache modes that already do it */ 751 if (bs->enable_write_cache) { 752 bdrv_flush(bs); 753 } 754 755 return 0; 756 } 757 758 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 759 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 760 { 761 /* Perform I/O through a temporary buffer so that users who scribble over 762 * their read buffer while the operation is in progress do not end up 763 * modifying the image file. This is critical for zero-copy guest I/O 764 * where anything might happen inside guest memory. 765 */ 766 void *bounce_buffer; 767 768 BlockDriver *drv = bs->drv; 769 struct iovec iov; 770 QEMUIOVector bounce_qiov; 771 int64_t cluster_sector_num; 772 int cluster_nb_sectors; 773 size_t skip_bytes; 774 int ret; 775 776 /* Cover entire cluster so no additional backing file I/O is required when 777 * allocating cluster in the image file. 
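     *
     * Worked example (illustrative numbers): with a 64k cluster size, i.e.
     * 128 sectors, a copy-on-read of sectors [130, 140) is widened by
     * bdrv_round_to_clusters() below to [128, 256), one whole cluster, so
     * the allocating write never needs an extra round trip to the backing
     * file.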
778 */ 779 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 780 &cluster_sector_num, &cluster_nb_sectors); 781 782 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 783 cluster_sector_num, cluster_nb_sectors); 784 785 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 786 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 787 if (bounce_buffer == NULL) { 788 ret = -ENOMEM; 789 goto err; 790 } 791 792 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 793 794 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 795 &bounce_qiov); 796 if (ret < 0) { 797 goto err; 798 } 799 800 if (drv->bdrv_co_write_zeroes && 801 buffer_is_zero(bounce_buffer, iov.iov_len)) { 802 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 803 cluster_nb_sectors, 0); 804 } else { 805 /* This does not change the data on the disk, it is not necessary 806 * to flush even in cache=writethrough mode. 807 */ 808 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 809 &bounce_qiov); 810 } 811 812 if (ret < 0) { 813 /* It might be okay to ignore write errors for guest requests. If this 814 * is a deliberate copy-on-read then we don't want to ignore the error. 815 * Simply report it in all cases. 816 */ 817 goto err; 818 } 819 820 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 821 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 822 nb_sectors * BDRV_SECTOR_SIZE); 823 824 err: 825 qemu_vfree(bounce_buffer); 826 return ret; 827 } 828 829 /* 830 * Forwards an already correctly aligned request to the BlockDriver. This 831 * handles copy on read and zeroing after EOF; any other features must be 832 * implemented by the caller. 833 */ 834 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 835 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 836 int64_t align, QEMUIOVector *qiov, int flags) 837 { 838 BlockDriver *drv = bs->drv; 839 int ret; 840 841 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 842 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 843 844 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 845 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 846 assert(!qiov || bytes == qiov->size); 847 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 848 849 /* Handle Copy on Read and associated serialisation */ 850 if (flags & BDRV_REQ_COPY_ON_READ) { 851 /* If we touch the same cluster it counts as an overlap. This 852 * guarantees that allocating writes will be serialized and not race 853 * with each other for the same cluster. For example, in copy-on-read 854 * it ensures that the CoR read and write operations are atomic and 855 * guest writes cannot interleave between them. 
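         *
         * Concretely (illustrative numbers): with 128-sector clusters, a CoR
         * read of sectors [130, 140) records an overlap range of [128, 256),
         * so a guest write to, say, sector 200 blocks in
         * wait_serialising_requests() instead of racing with the
         * bounce-buffer write-back in bdrv_co_do_copy_on_readv().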
*/ 856 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 857 } 858 859 if (!(flags & BDRV_REQ_NO_SERIALISING)) { 860 wait_serialising_requests(req); 861 } 862 863 if (flags & BDRV_REQ_COPY_ON_READ) { 864 int pnum; 865 866 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 867 if (ret < 0) { 868 goto out; 869 } 870 871 if (!ret || pnum != nb_sectors) { 872 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 873 goto out; 874 } 875 } 876 877 /* Forward the request to the BlockDriver */ 878 if (!bs->zero_beyond_eof) { 879 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 880 } else { 881 /* Read zeros after EOF */ 882 int64_t total_sectors, max_nb_sectors; 883 884 total_sectors = bdrv_nb_sectors(bs); 885 if (total_sectors < 0) { 886 ret = total_sectors; 887 goto out; 888 } 889 890 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 891 align >> BDRV_SECTOR_BITS); 892 if (nb_sectors < max_nb_sectors) { 893 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 894 } else if (max_nb_sectors > 0) { 895 QEMUIOVector local_qiov; 896 897 qemu_iovec_init(&local_qiov, qiov->niov); 898 qemu_iovec_concat(&local_qiov, qiov, 0, 899 max_nb_sectors * BDRV_SECTOR_SIZE); 900 901 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 902 &local_qiov); 903 904 qemu_iovec_destroy(&local_qiov); 905 } else { 906 ret = 0; 907 } 908 909 /* Reading beyond end of file is supposed to produce zeroes */ 910 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 911 uint64_t offset = MAX(0, total_sectors - sector_num); 912 uint64_t bytes = (sector_num + nb_sectors - offset) * 913 BDRV_SECTOR_SIZE; 914 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 915 } 916 } 917 918 out: 919 return ret; 920 } 921 922 /* 923 * Handle a read request in coroutine context 924 */ 925 int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 926 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 927 BdrvRequestFlags flags) 928 { 929 BlockDriver *drv = bs->drv; 930 BdrvTrackedRequest req; 931 932 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 933 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 934 uint8_t *head_buf = NULL; 935 uint8_t *tail_buf = NULL; 936 QEMUIOVector local_qiov; 937 bool use_local_qiov = false; 938 int ret; 939 940 if (!drv) { 941 return -ENOMEDIUM; 942 } 943 944 ret = bdrv_check_byte_request(bs, offset, bytes); 945 if (ret < 0) { 946 return ret; 947 } 948 949 /* Don't do copy-on-read if we read data before write operation */ 950 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { 951 flags |= BDRV_REQ_COPY_ON_READ; 952 } 953 954 /* throttling disk I/O */ 955 if (bs->io_limits_enabled) { 956 throttle_group_co_io_limits_intercept(bs, bytes, false); 957 } 958 959 /* Align read if necessary by padding qiov */ 960 if (offset & (align - 1)) { 961 head_buf = qemu_blockalign(bs, align); 962 qemu_iovec_init(&local_qiov, qiov->niov + 2); 963 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 964 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 965 use_local_qiov = true; 966 967 bytes += offset & (align - 1); 968 offset = offset & ~(align - 1); 969 } 970 971 if ((offset + bytes) & (align - 1)) { 972 if (!use_local_qiov) { 973 qemu_iovec_init(&local_qiov, qiov->niov + 1); 974 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 975 use_local_qiov = true; 976 } 977 tail_buf = qemu_blockalign(bs, align); 978 qemu_iovec_add(&local_qiov, tail_buf, 979 align - ((offset + bytes) & (align - 1))); 980 
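
        /* Worked example (illustrative numbers): with align == 512, a read
         * of bytes [100, 1000) grows to [0, 1024): head_buf soaks up the
         * first 100 bytes, tail_buf the last 24, and only whole sectors are
         * passed down to the driver. */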
981 bytes = ROUND_UP(bytes, align); 982 } 983 984 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 985 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 986 use_local_qiov ? &local_qiov : qiov, 987 flags); 988 tracked_request_end(&req); 989 990 if (use_local_qiov) { 991 qemu_iovec_destroy(&local_qiov); 992 qemu_vfree(head_buf); 993 qemu_vfree(tail_buf); 994 } 995 996 return ret; 997 } 998 999 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 1000 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1001 BdrvRequestFlags flags) 1002 { 1003 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1004 return -EINVAL; 1005 } 1006 1007 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 1008 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1009 } 1010 1011 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 1012 int nb_sectors, QEMUIOVector *qiov) 1013 { 1014 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1015 1016 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1017 } 1018 1019 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, 1020 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1021 { 1022 trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); 1023 1024 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1025 BDRV_REQ_NO_SERIALISING); 1026 } 1027 1028 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1029 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1030 { 1031 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1032 1033 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1034 BDRV_REQ_COPY_ON_READ); 1035 } 1036 1037 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1038 1039 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1040 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1041 { 1042 BlockDriver *drv = bs->drv; 1043 QEMUIOVector qiov; 1044 struct iovec iov = {0}; 1045 int ret = 0; 1046 1047 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1048 BDRV_REQUEST_MAX_SECTORS); 1049 1050 while (nb_sectors > 0 && !ret) { 1051 int num = nb_sectors; 1052 1053 /* Align request. Block drivers can expect the "bulk" of the request 1054 * to be aligned. 1055 */ 1056 if (bs->bl.write_zeroes_alignment 1057 && num > bs->bl.write_zeroes_alignment) { 1058 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1059 /* Make a small request up to the first aligned sector. */ 1060 num = bs->bl.write_zeroes_alignment; 1061 num -= sector_num % bs->bl.write_zeroes_alignment; 1062 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1063 /* Shorten the request to the last aligned sector. num cannot 1064 * underflow because num > bs->bl.write_zeroes_alignment. 
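                 *
                 * Worked example (illustrative numbers): with a
                 * write_zeroes_alignment of 8 sectors, a request for sectors
                 * [5, 30) becomes three driver calls: [5, 8) up to the first
                 * boundary, [8, 24) fully aligned, and [24, 30) for the
                 * unaligned tail.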
1065 */ 1066 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 1067 } 1068 } 1069 1070 /* limit request size */ 1071 if (num > max_write_zeroes) { 1072 num = max_write_zeroes; 1073 } 1074 1075 ret = -ENOTSUP; 1076 /* First try the efficient write zeroes operation */ 1077 if (drv->bdrv_co_write_zeroes) { 1078 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 1079 } 1080 1081 if (ret == -ENOTSUP) { 1082 /* Fall back to bounce buffer if write zeroes is unsupported */ 1083 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, 1084 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1085 num = MIN(num, max_xfer_len); 1086 iov.iov_len = num * BDRV_SECTOR_SIZE; 1087 if (iov.iov_base == NULL) { 1088 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); 1089 if (iov.iov_base == NULL) { 1090 ret = -ENOMEM; 1091 goto fail; 1092 } 1093 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 1094 } 1095 qemu_iovec_init_external(&qiov, &iov, 1); 1096 1097 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 1098 1099 /* Keep bounce buffer around if it is big enough for all 1100 * all future requests. 1101 */ 1102 if (num < max_xfer_len) { 1103 qemu_vfree(iov.iov_base); 1104 iov.iov_base = NULL; 1105 } 1106 } 1107 1108 sector_num += num; 1109 nb_sectors -= num; 1110 } 1111 1112 fail: 1113 qemu_vfree(iov.iov_base); 1114 return ret; 1115 } 1116 1117 /* 1118 * Forwards an already correctly aligned write request to the BlockDriver. 1119 */ 1120 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 1121 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1122 QEMUIOVector *qiov, int flags) 1123 { 1124 BlockDriver *drv = bs->drv; 1125 bool waited; 1126 int ret; 1127 1128 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 1129 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 1130 1131 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1132 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1133 assert(!qiov || bytes == qiov->size); 1134 assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1135 1136 waited = wait_serialising_requests(req); 1137 assert(!waited || !req->serialising); 1138 assert(req->overlap_offset <= offset); 1139 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1140 1141 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1142 1143 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1144 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 1145 qemu_iovec_is_zero(qiov)) { 1146 flags |= BDRV_REQ_ZERO_WRITE; 1147 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1148 flags |= BDRV_REQ_MAY_UNMAP; 1149 } 1150 } 1151 1152 if (ret < 0) { 1153 /* Do nothing, write notifier decided to fail this request */ 1154 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1155 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 1156 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 1157 } else { 1158 bdrv_debug_event(bs, BLKDBG_PWRITEV); 1159 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 1160 } 1161 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 1162 1163 if (ret == 0 && !bs->enable_write_cache) { 1164 ret = bdrv_co_flush(bs); 1165 } 1166 1167 bdrv_set_dirty(bs, sector_num, nb_sectors); 1168 1169 if (bs->wr_highest_offset < offset + bytes) { 1170 bs->wr_highest_offset = offset + bytes; 1171 } 1172 1173 if (ret >= 0) { 1174 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1175 } 1176 1177 return ret; 1178 } 1179 1180 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState 
*bs, 1181 int64_t offset, 1182 unsigned int bytes, 1183 BdrvRequestFlags flags, 1184 BdrvTrackedRequest *req) 1185 { 1186 uint8_t *buf = NULL; 1187 QEMUIOVector local_qiov; 1188 struct iovec iov; 1189 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1190 unsigned int head_padding_bytes, tail_padding_bytes; 1191 int ret = 0; 1192 1193 head_padding_bytes = offset & (align - 1); 1194 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1195 1196 1197 assert(flags & BDRV_REQ_ZERO_WRITE); 1198 if (head_padding_bytes || tail_padding_bytes) { 1199 buf = qemu_blockalign(bs, align); 1200 iov = (struct iovec) { 1201 .iov_base = buf, 1202 .iov_len = align, 1203 }; 1204 qemu_iovec_init_external(&local_qiov, &iov, 1); 1205 } 1206 if (head_padding_bytes) { 1207 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1208 1209 /* RMW the unaligned part before head. */ 1210 mark_request_serialising(req, align); 1211 wait_serialising_requests(req); 1212 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1213 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1214 align, &local_qiov, 0); 1215 if (ret < 0) { 1216 goto fail; 1217 } 1218 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1219 1220 memset(buf + head_padding_bytes, 0, zero_bytes); 1221 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1222 &local_qiov, 1223 flags & ~BDRV_REQ_ZERO_WRITE); 1224 if (ret < 0) { 1225 goto fail; 1226 } 1227 offset += zero_bytes; 1228 bytes -= zero_bytes; 1229 } 1230 1231 assert(!bytes || (offset & (align - 1)) == 0); 1232 if (bytes >= align) { 1233 /* Write the aligned part in the middle. */ 1234 uint64_t aligned_bytes = bytes & ~(align - 1); 1235 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1236 NULL, flags); 1237 if (ret < 0) { 1238 goto fail; 1239 } 1240 bytes -= aligned_bytes; 1241 offset += aligned_bytes; 1242 } 1243 1244 assert(!bytes || (offset & (align - 1)) == 0); 1245 if (bytes) { 1246 assert(align == tail_padding_bytes + bytes); 1247 /* RMW the unaligned part after tail. 
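         * (For instance, with align == 512 and a zero request ending 100
         * bytes into the final block, that block is read in full, its first
         * 100 bytes are cleared in the bounce buffer, and the whole block is
         * written back.)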
*/ 1248 mark_request_serialising(req, align); 1249 wait_serialising_requests(req); 1250 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1251 ret = bdrv_aligned_preadv(bs, req, offset, align, 1252 align, &local_qiov, 0); 1253 if (ret < 0) { 1254 goto fail; 1255 } 1256 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1257 1258 memset(buf, 0, bytes); 1259 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1260 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1261 } 1262 fail: 1263 qemu_vfree(buf); 1264 return ret; 1265 1266 } 1267 1268 /* 1269 * Handle a write request in coroutine context 1270 */ 1271 int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1272 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1273 BdrvRequestFlags flags) 1274 { 1275 BdrvTrackedRequest req; 1276 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1277 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1278 uint8_t *head_buf = NULL; 1279 uint8_t *tail_buf = NULL; 1280 QEMUIOVector local_qiov; 1281 bool use_local_qiov = false; 1282 int ret; 1283 1284 if (!bs->drv) { 1285 return -ENOMEDIUM; 1286 } 1287 if (bs->read_only) { 1288 return -EPERM; 1289 } 1290 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 1291 1292 ret = bdrv_check_byte_request(bs, offset, bytes); 1293 if (ret < 0) { 1294 return ret; 1295 } 1296 1297 /* throttling disk I/O */ 1298 if (bs->io_limits_enabled) { 1299 throttle_group_co_io_limits_intercept(bs, bytes, true); 1300 } 1301 1302 /* 1303 * Align write if necessary by performing a read-modify-write cycle. 1304 * Pad qiov with the read parts and be sure to have a tracked request not 1305 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1306 */ 1307 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 1308 1309 if (!qiov) { 1310 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1311 goto out; 1312 } 1313 1314 if (offset & (align - 1)) { 1315 QEMUIOVector head_qiov; 1316 struct iovec head_iov; 1317 1318 mark_request_serialising(&req, align); 1319 wait_serialising_requests(&req); 1320 1321 head_buf = qemu_blockalign(bs, align); 1322 head_iov = (struct iovec) { 1323 .iov_base = head_buf, 1324 .iov_len = align, 1325 }; 1326 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1327 1328 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1329 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1330 align, &head_qiov, 0); 1331 if (ret < 0) { 1332 goto fail; 1333 } 1334 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1335 1336 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1337 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1338 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1339 use_local_qiov = true; 1340 1341 bytes += offset & (align - 1); 1342 offset = offset & ~(align - 1); 1343 } 1344 1345 if ((offset + bytes) & (align - 1)) { 1346 QEMUIOVector tail_qiov; 1347 struct iovec tail_iov; 1348 size_t tail_bytes; 1349 bool waited; 1350 1351 mark_request_serialising(&req, align); 1352 waited = wait_serialising_requests(&req); 1353 assert(!waited || !use_local_qiov); 1354 1355 tail_buf = qemu_blockalign(bs, align); 1356 tail_iov = (struct iovec) { 1357 .iov_base = tail_buf, 1358 .iov_len = align, 1359 }; 1360 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1361 1362 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1363 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1364 align, &tail_qiov, 0); 1365 if (ret < 0) { 1366 goto fail; 1367 } 1368 bdrv_debug_event(bs, 
BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1369 1370 if (!use_local_qiov) { 1371 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1372 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1373 use_local_qiov = true; 1374 } 1375 1376 tail_bytes = (offset + bytes) & (align - 1); 1377 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1378 1379 bytes = ROUND_UP(bytes, align); 1380 } 1381 1382 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1383 use_local_qiov ? &local_qiov : qiov, 1384 flags); 1385 1386 fail: 1387 1388 if (use_local_qiov) { 1389 qemu_iovec_destroy(&local_qiov); 1390 } 1391 qemu_vfree(head_buf); 1392 qemu_vfree(tail_buf); 1393 out: 1394 tracked_request_end(&req); 1395 return ret; 1396 } 1397 1398 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1399 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1400 BdrvRequestFlags flags) 1401 { 1402 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1403 return -EINVAL; 1404 } 1405 1406 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1407 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1408 } 1409 1410 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1411 int nb_sectors, QEMUIOVector *qiov) 1412 { 1413 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1414 1415 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1416 } 1417 1418 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1419 int64_t sector_num, int nb_sectors, 1420 BdrvRequestFlags flags) 1421 { 1422 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1423 1424 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1425 flags &= ~BDRV_REQ_MAY_UNMAP; 1426 } 1427 1428 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1429 BDRV_REQ_ZERO_WRITE | flags); 1430 } 1431 1432 typedef struct BdrvCoGetBlockStatusData { 1433 BlockDriverState *bs; 1434 BlockDriverState *base; 1435 BlockDriverState **file; 1436 int64_t sector_num; 1437 int nb_sectors; 1438 int *pnum; 1439 int64_t ret; 1440 bool done; 1441 } BdrvCoGetBlockStatusData; 1442 1443 /* 1444 * Returns the allocation status of the specified sectors. 1445 * Drivers not implementing the functionality are assumed to not support 1446 * backing files, hence all their sectors are reported as allocated. 1447 * 1448 * If 'sector_num' is beyond the end of the disk image the return value is 0 1449 * and 'pnum' is set to 0. 1450 * 1451 * 'pnum' is set to the number of sectors (including and immediately following 1452 * the specified sector) that are known to be in the same 1453 * allocated/unallocated state. 1454 * 1455 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1456 * beyond the end of the disk image it will be clamped. 1457 * 1458 * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' 1459 * points to the BDS which the sector range is allocated in. 
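 *
 * Illustrative example (hypothetical driver, hypothetical 'host_offset'):
 * a return value of
 *
 *     BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID |
 *         (host_offset & BDRV_BLOCK_OFFSET_MASK)
 *
 * together with *pnum = 16 and *file set to the protocol BDS says that the
 * next 16 sectors are allocated data stored at host_offset in that file.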
1460 */ 1461 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1462 int64_t sector_num, 1463 int nb_sectors, int *pnum, 1464 BlockDriverState **file) 1465 { 1466 int64_t total_sectors; 1467 int64_t n; 1468 int64_t ret, ret2; 1469 1470 total_sectors = bdrv_nb_sectors(bs); 1471 if (total_sectors < 0) { 1472 return total_sectors; 1473 } 1474 1475 if (sector_num >= total_sectors) { 1476 *pnum = 0; 1477 return 0; 1478 } 1479 1480 n = total_sectors - sector_num; 1481 if (n < nb_sectors) { 1482 nb_sectors = n; 1483 } 1484 1485 if (!bs->drv->bdrv_co_get_block_status) { 1486 *pnum = nb_sectors; 1487 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1488 if (bs->drv->protocol_name) { 1489 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1490 } 1491 return ret; 1492 } 1493 1494 *file = NULL; 1495 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, 1496 file); 1497 if (ret < 0) { 1498 *pnum = 0; 1499 return ret; 1500 } 1501 1502 if (ret & BDRV_BLOCK_RAW) { 1503 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1504 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1505 *pnum, pnum, file); 1506 } 1507 1508 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1509 ret |= BDRV_BLOCK_ALLOCATED; 1510 } else { 1511 if (bdrv_unallocated_blocks_are_zero(bs)) { 1512 ret |= BDRV_BLOCK_ZERO; 1513 } else if (bs->backing) { 1514 BlockDriverState *bs2 = bs->backing->bs; 1515 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1516 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1517 ret |= BDRV_BLOCK_ZERO; 1518 } 1519 } 1520 } 1521 1522 if (*file && *file != bs && 1523 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1524 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1525 BlockDriverState *file2; 1526 int file_pnum; 1527 1528 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, 1529 *pnum, &file_pnum, &file2); 1530 if (ret2 >= 0) { 1531 /* Ignore errors. This is just providing extra information, it 1532 * is useful but not necessary. 1533 */ 1534 if (!file_pnum) { 1535 /* !file_pnum indicates an offset at or beyond the EOF; it is 1536 * perfectly valid for the format block driver to point to such 1537 * offsets, so catch it and mark everything as zero */ 1538 ret |= BDRV_BLOCK_ZERO; 1539 } else { 1540 /* Limit request to the range reported by the protocol driver */ 1541 *pnum = file_pnum; 1542 ret |= (ret2 & BDRV_BLOCK_ZERO); 1543 } 1544 } 1545 } 1546 1547 return ret; 1548 } 1549 1550 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1551 BlockDriverState *base, 1552 int64_t sector_num, 1553 int nb_sectors, 1554 int *pnum, 1555 BlockDriverState **file) 1556 { 1557 BlockDriverState *p; 1558 int64_t ret = 0; 1559 1560 assert(bs != base); 1561 for (p = bs; p != base; p = backing_bs(p)) { 1562 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); 1563 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1564 break; 1565 } 1566 /* [sector_num, pnum] unallocated on this layer, which could be only 1567 * the first part of [sector_num, nb_sectors]. 
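         *
         * For example (illustrative numbers): querying 100 sectors where
         * this layer reports only the first 40 as unallocated narrows
         * nb_sectors to 40, so the next backing layer is only asked about
         * the range the caller can still use.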
*/ 1568 nb_sectors = MIN(nb_sectors, *pnum); 1569 } 1570 return ret; 1571 } 1572 1573 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1574 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1575 { 1576 BdrvCoGetBlockStatusData *data = opaque; 1577 1578 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1579 data->sector_num, 1580 data->nb_sectors, 1581 data->pnum, 1582 data->file); 1583 data->done = true; 1584 } 1585 1586 /* 1587 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1588 * 1589 * See bdrv_co_get_block_status_above() for details. 1590 */ 1591 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1592 BlockDriverState *base, 1593 int64_t sector_num, 1594 int nb_sectors, int *pnum, 1595 BlockDriverState **file) 1596 { 1597 Coroutine *co; 1598 BdrvCoGetBlockStatusData data = { 1599 .bs = bs, 1600 .base = base, 1601 .file = file, 1602 .sector_num = sector_num, 1603 .nb_sectors = nb_sectors, 1604 .pnum = pnum, 1605 .done = false, 1606 }; 1607 1608 if (qemu_in_coroutine()) { 1609 /* Fast-path if already in coroutine context */ 1610 bdrv_get_block_status_above_co_entry(&data); 1611 } else { 1612 AioContext *aio_context = bdrv_get_aio_context(bs); 1613 1614 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1615 qemu_coroutine_enter(co, &data); 1616 while (!data.done) { 1617 aio_poll(aio_context, true); 1618 } 1619 } 1620 return data.ret; 1621 } 1622 1623 int64_t bdrv_get_block_status(BlockDriverState *bs, 1624 int64_t sector_num, 1625 int nb_sectors, int *pnum, 1626 BlockDriverState **file) 1627 { 1628 return bdrv_get_block_status_above(bs, backing_bs(bs), 1629 sector_num, nb_sectors, pnum, file); 1630 } 1631 1632 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1633 int nb_sectors, int *pnum) 1634 { 1635 BlockDriverState *file; 1636 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, 1637 &file); 1638 if (ret < 0) { 1639 return ret; 1640 } 1641 return !!(ret & BDRV_BLOCK_ALLOCATED); 1642 } 1643 1644 /* 1645 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1646 * 1647 * Return true if the given sector is allocated in any image between 1648 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1649 * sector is allocated in any image of the chain. Return false otherwise. 1650 * 1651 * 'pnum' is set to the number of sectors (including and immediately following 1652 * the specified sector) that are known to be in the same 1653 * allocated/unallocated state. 1654 * 1655 */ 1656 int bdrv_is_allocated_above(BlockDriverState *top, 1657 BlockDriverState *base, 1658 int64_t sector_num, 1659 int nb_sectors, int *pnum) 1660 { 1661 BlockDriverState *intermediate; 1662 int ret, n = nb_sectors; 1663 1664 intermediate = top; 1665 while (intermediate && intermediate != base) { 1666 int pnum_inter; 1667 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1668 &pnum_inter); 1669 if (ret < 0) { 1670 return ret; 1671 } else if (ret) { 1672 *pnum = pnum_inter; 1673 return 1; 1674 } 1675 1676 /* 1677 * [sector_num, nb_sectors] is unallocated on top but intermediate 1678 * might have 1679 * 1680 * [sector_num+x, nr_sectors] allocated. 
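         *
         * Example (illustrative numbers): asking about 64 sectors where TOP
         * reports its first 16 as unallocated and INTER1 its first 8 leaves
         * *pnum == 8 on return: only those 8 sectors are known to be
         * unallocated in every layer from TOP down to (but not including)
         * BASE.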
1681 */ 1682 if (n > pnum_inter && 1683 (intermediate == top || 1684 sector_num + pnum_inter < intermediate->total_sectors)) { 1685 n = pnum_inter; 1686 } 1687 1688 intermediate = backing_bs(intermediate); 1689 } 1690 1691 *pnum = n; 1692 return 0; 1693 } 1694 1695 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1696 const uint8_t *buf, int nb_sectors) 1697 { 1698 BlockDriver *drv = bs->drv; 1699 int ret; 1700 1701 if (!drv) { 1702 return -ENOMEDIUM; 1703 } 1704 if (!drv->bdrv_write_compressed) { 1705 return -ENOTSUP; 1706 } 1707 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1708 if (ret < 0) { 1709 return ret; 1710 } 1711 1712 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1713 1714 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1715 } 1716 1717 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1718 int64_t pos, int size) 1719 { 1720 QEMUIOVector qiov; 1721 struct iovec iov = { 1722 .iov_base = (void *) buf, 1723 .iov_len = size, 1724 }; 1725 1726 qemu_iovec_init_external(&qiov, &iov, 1); 1727 return bdrv_writev_vmstate(bs, &qiov, pos); 1728 } 1729 1730 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1731 { 1732 BlockDriver *drv = bs->drv; 1733 1734 if (!drv) { 1735 return -ENOMEDIUM; 1736 } else if (drv->bdrv_save_vmstate) { 1737 return drv->bdrv_save_vmstate(bs, qiov, pos); 1738 } else if (bs->file) { 1739 return bdrv_writev_vmstate(bs->file->bs, qiov, pos); 1740 } 1741 1742 return -ENOTSUP; 1743 } 1744 1745 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1746 int64_t pos, int size) 1747 { 1748 BlockDriver *drv = bs->drv; 1749 if (!drv) 1750 return -ENOMEDIUM; 1751 if (drv->bdrv_load_vmstate) 1752 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1753 if (bs->file) 1754 return bdrv_load_vmstate(bs->file->bs, buf, pos, size); 1755 return -ENOTSUP; 1756 } 1757 1758 /**************************************************************/ 1759 /* async I/Os */ 1760 1761 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1762 QEMUIOVector *qiov, int nb_sectors, 1763 BlockCompletionFunc *cb, void *opaque) 1764 { 1765 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1766 1767 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1768 cb, opaque, false); 1769 } 1770 1771 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1772 QEMUIOVector *qiov, int nb_sectors, 1773 BlockCompletionFunc *cb, void *opaque) 1774 { 1775 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1776 1777 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1778 cb, opaque, true); 1779 } 1780 1781 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1782 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1783 BlockCompletionFunc *cb, void *opaque) 1784 { 1785 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1786 1787 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1788 BDRV_REQ_ZERO_WRITE | flags, 1789 cb, opaque, true); 1790 } 1791 1792 1793 typedef struct MultiwriteCB { 1794 int error; 1795 int num_requests; 1796 int num_callbacks; 1797 struct { 1798 BlockCompletionFunc *cb; 1799 void *opaque; 1800 QEMUIOVector *free_qiov; 1801 } callbacks[]; 1802 } MultiwriteCB; 1803 1804 static void multiwrite_user_cb(MultiwriteCB *mcb) 1805 { 1806 int i; 1807 1808 for (i = 0; i < mcb->num_callbacks; i++) { 1809 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1810 if (mcb->callbacks[i].free_qiov) { 1811 
qemu_iovec_destroy(mcb->callbacks[i].free_qiov); 1812 } 1813 g_free(mcb->callbacks[i].free_qiov); 1814 } 1815 } 1816 1817 static void multiwrite_cb(void *opaque, int ret) 1818 { 1819 MultiwriteCB *mcb = opaque; 1820 1821 trace_multiwrite_cb(mcb, ret); 1822 1823 if (ret < 0 && !mcb->error) { 1824 mcb->error = ret; 1825 } 1826 1827 mcb->num_requests--; 1828 if (mcb->num_requests == 0) { 1829 multiwrite_user_cb(mcb); 1830 g_free(mcb); 1831 } 1832 } 1833 1834 static int multiwrite_req_compare(const void *a, const void *b) 1835 { 1836 const BlockRequest *req1 = a, *req2 = b; 1837 1838 /* 1839 * Note that we can't simply subtract req2->sector from req1->sector 1840 * here as that could overflow the return value. 1841 */ 1842 if (req1->sector > req2->sector) { 1843 return 1; 1844 } else if (req1->sector < req2->sector) { 1845 return -1; 1846 } else { 1847 return 0; 1848 } 1849 } 1850 1851 /* 1852 * Takes a bunch of requests and tries to merge them. Returns the number of 1853 * requests that remain after merging. 1854 */ 1855 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, 1856 int num_reqs, MultiwriteCB *mcb) 1857 { 1858 int i, outidx; 1859 1860 // Sort requests by start sector 1861 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); 1862 1863 // Check if adjacent requests touch the same clusters. If so, combine them, 1864 // filling up gaps with zero sectors. 1865 outidx = 0; 1866 for (i = 1; i < num_reqs; i++) { 1867 int merge = 0; 1868 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; 1869 1870 // Handle exactly sequential writes and overlapping writes. 1871 if (reqs[i].sector <= oldreq_last) { 1872 merge = 1; 1873 } 1874 1875 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > 1876 bs->bl.max_iov) { 1877 merge = 0; 1878 } 1879 1880 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + 1881 reqs[i].nb_sectors > bs->bl.max_transfer_length) { 1882 merge = 0; 1883 } 1884 1885 if (merge) { 1886 size_t size; 1887 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); 1888 qemu_iovec_init(qiov, 1889 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); 1890 1891 // Add the first request to the merged one. If the requests are 1892 // overlapping, drop the last sectors of the first request. 1893 size = (reqs[i].sector - reqs[outidx].sector) << 9; 1894 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); 1895 1896 // We should need to add any zeros between the two requests 1897 assert (reqs[i].sector <= oldreq_last); 1898 1899 // Add the second request 1900 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); 1901 1902 // Add tail of first request, if necessary 1903 if (qiov->size < reqs[outidx].qiov->size) { 1904 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, 1905 reqs[outidx].qiov->size - qiov->size); 1906 } 1907 1908 reqs[outidx].nb_sectors = qiov->size >> 9; 1909 reqs[outidx].qiov = qiov; 1910 1911 mcb->callbacks[i].free_qiov = reqs[outidx].qiov; 1912 } else { 1913 outidx++; 1914 reqs[outidx].sector = reqs[i].sector; 1915 reqs[outidx].nb_sectors = reqs[i].nb_sectors; 1916 reqs[outidx].qiov = reqs[i].qiov; 1917 } 1918 } 1919 1920 if (bs->blk) { 1921 block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, 1922 num_reqs - outidx - 1); 1923 } 1924 1925 return outidx + 1; 1926 } 1927 1928 /* 1929 * Submit multiple AIO write requests at once. 1930 * 1931 * On success, the function returns 0 and all requests in the reqs array have 1932 * been submitted. 
 * In the error case, this function returns -1, and any of the requests may or
 * may not be submitted yet. In particular, this means that the callback will
 * be called for some of the requests, for others it won't. The caller must
 * check the error field of the BlockRequest to wait for the right callbacks
 * (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
*/ 2005 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2006 { 2007 if (acb->aiocb_info->cancel_async) { 2008 acb->aiocb_info->cancel_async(acb); 2009 } 2010 } 2011 2012 /**************************************************************/ 2013 /* async block device emulation */ 2014 2015 typedef struct BlockAIOCBSync { 2016 BlockAIOCB common; 2017 QEMUBH *bh; 2018 int ret; 2019 /* vector translation state */ 2020 QEMUIOVector *qiov; 2021 uint8_t *bounce; 2022 int is_write; 2023 } BlockAIOCBSync; 2024 2025 static const AIOCBInfo bdrv_em_aiocb_info = { 2026 .aiocb_size = sizeof(BlockAIOCBSync), 2027 }; 2028 2029 static void bdrv_aio_bh_cb(void *opaque) 2030 { 2031 BlockAIOCBSync *acb = opaque; 2032 2033 if (!acb->is_write && acb->ret >= 0) { 2034 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 2035 } 2036 qemu_vfree(acb->bounce); 2037 acb->common.cb(acb->common.opaque, acb->ret); 2038 qemu_bh_delete(acb->bh); 2039 acb->bh = NULL; 2040 qemu_aio_unref(acb); 2041 } 2042 2043 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 2044 int64_t sector_num, 2045 QEMUIOVector *qiov, 2046 int nb_sectors, 2047 BlockCompletionFunc *cb, 2048 void *opaque, 2049 int is_write) 2050 2051 { 2052 BlockAIOCBSync *acb; 2053 2054 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 2055 acb->is_write = is_write; 2056 acb->qiov = qiov; 2057 acb->bounce = qemu_try_blockalign(bs, qiov->size); 2058 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); 2059 2060 if (acb->bounce == NULL) { 2061 acb->ret = -ENOMEM; 2062 } else if (is_write) { 2063 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 2064 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 2065 } else { 2066 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 2067 } 2068 2069 qemu_bh_schedule(acb->bh); 2070 2071 return &acb->common; 2072 } 2073 2074 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 2075 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2076 BlockCompletionFunc *cb, void *opaque) 2077 { 2078 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 2079 } 2080 2081 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 2082 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2083 BlockCompletionFunc *cb, void *opaque) 2084 { 2085 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 2086 } 2087 2088 2089 typedef struct BlockAIOCBCoroutine { 2090 BlockAIOCB common; 2091 BlockRequest req; 2092 bool is_write; 2093 bool need_bh; 2094 bool *done; 2095 QEMUBH* bh; 2096 } BlockAIOCBCoroutine; 2097 2098 static const AIOCBInfo bdrv_em_co_aiocb_info = { 2099 .aiocb_size = sizeof(BlockAIOCBCoroutine), 2100 }; 2101 2102 static void bdrv_co_complete(BlockAIOCBCoroutine *acb) 2103 { 2104 if (!acb->need_bh) { 2105 acb->common.cb(acb->common.opaque, acb->req.error); 2106 qemu_aio_unref(acb); 2107 } 2108 } 2109 2110 static void bdrv_co_em_bh(void *opaque) 2111 { 2112 BlockAIOCBCoroutine *acb = opaque; 2113 2114 assert(!acb->need_bh); 2115 qemu_bh_delete(acb->bh); 2116 bdrv_co_complete(acb); 2117 } 2118 2119 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) 2120 { 2121 acb->need_bh = false; 2122 if (acb->req.error != -EINPROGRESS) { 2123 BlockDriverState *bs = acb->common.bs; 2124 2125 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 2126 qemu_bh_schedule(acb->bh); 2127 } 2128 } 2129 2130 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 2131 static void coroutine_fn 
bdrv_co_do_rw(void *opaque) 2132 { 2133 BlockAIOCBCoroutine *acb = opaque; 2134 BlockDriverState *bs = acb->common.bs; 2135 2136 if (!acb->is_write) { 2137 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 2138 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2139 } else { 2140 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 2141 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2142 } 2143 2144 bdrv_co_complete(acb); 2145 } 2146 2147 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 2148 int64_t sector_num, 2149 QEMUIOVector *qiov, 2150 int nb_sectors, 2151 BdrvRequestFlags flags, 2152 BlockCompletionFunc *cb, 2153 void *opaque, 2154 bool is_write) 2155 { 2156 Coroutine *co; 2157 BlockAIOCBCoroutine *acb; 2158 2159 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2160 acb->need_bh = true; 2161 acb->req.error = -EINPROGRESS; 2162 acb->req.sector = sector_num; 2163 acb->req.nb_sectors = nb_sectors; 2164 acb->req.qiov = qiov; 2165 acb->req.flags = flags; 2166 acb->is_write = is_write; 2167 2168 co = qemu_coroutine_create(bdrv_co_do_rw); 2169 qemu_coroutine_enter(co, acb); 2170 2171 bdrv_co_maybe_schedule_bh(acb); 2172 return &acb->common; 2173 } 2174 2175 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 2176 { 2177 BlockAIOCBCoroutine *acb = opaque; 2178 BlockDriverState *bs = acb->common.bs; 2179 2180 acb->req.error = bdrv_co_flush(bs); 2181 bdrv_co_complete(acb); 2182 } 2183 2184 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, 2185 BlockCompletionFunc *cb, void *opaque) 2186 { 2187 trace_bdrv_aio_flush(bs, opaque); 2188 2189 Coroutine *co; 2190 BlockAIOCBCoroutine *acb; 2191 2192 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2193 acb->need_bh = true; 2194 acb->req.error = -EINPROGRESS; 2195 2196 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 2197 qemu_coroutine_enter(co, acb); 2198 2199 bdrv_co_maybe_schedule_bh(acb); 2200 return &acb->common; 2201 } 2202 2203 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 2204 { 2205 BlockAIOCBCoroutine *acb = opaque; 2206 BlockDriverState *bs = acb->common.bs; 2207 2208 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 2209 bdrv_co_complete(acb); 2210 } 2211 2212 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, 2213 int64_t sector_num, int nb_sectors, 2214 BlockCompletionFunc *cb, void *opaque) 2215 { 2216 Coroutine *co; 2217 BlockAIOCBCoroutine *acb; 2218 2219 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 2220 2221 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2222 acb->need_bh = true; 2223 acb->req.error = -EINPROGRESS; 2224 acb->req.sector = sector_num; 2225 acb->req.nb_sectors = nb_sectors; 2226 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 2227 qemu_coroutine_enter(co, acb); 2228 2229 bdrv_co_maybe_schedule_bh(acb); 2230 return &acb->common; 2231 } 2232 2233 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 2234 BlockCompletionFunc *cb, void *opaque) 2235 { 2236 BlockAIOCB *acb; 2237 2238 acb = g_malloc(aiocb_info->aiocb_size); 2239 acb->aiocb_info = aiocb_info; 2240 acb->bs = bs; 2241 acb->cb = cb; 2242 acb->opaque = opaque; 2243 acb->refcnt = 1; 2244 return acb; 2245 } 2246 2247 void qemu_aio_ref(void *p) 2248 { 2249 BlockAIOCB *acb = p; 2250 acb->refcnt++; 2251 } 2252 2253 void qemu_aio_unref(void *p) 2254 { 2255 BlockAIOCB *acb = p; 2256 assert(acb->refcnt > 0); 2257 if (--acb->refcnt == 0) { 2258 g_free(acb); 2259 } 2260 } 2261 2262 
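/*
 * AIOCB lifecycle sketch (illustrative only, not part of the original file;
 * MyAIOCB, my_aiocb_info and my_complete are hypothetical names).  An AIO
 * implementation embeds BlockAIOCB in its own state, allocates it with
 * qemu_aio_get() and drops that initial reference with qemu_aio_unref()
 * once the completion callback has run:
 *
 *     typedef struct MyAIOCB {
 *         BlockAIOCB common;
 *         int ret;
 *     } MyAIOCB;
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *     };
 *
 *     static void my_complete(MyAIOCB *acb)
 *     {
 *         acb->common.cb(acb->common.opaque, acb->ret);
 *         qemu_aio_unref(acb);   // drops the reference taken by qemu_aio_get()
 *     }
 *
 * bdrv_aio_cancel() relies on this reference count: it takes an extra
 * reference and polls until only that reference remains.
 */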
/**************************************************************/ 2263 /* Coroutine block device emulation */ 2264 2265 typedef struct CoroutineIOCompletion { 2266 Coroutine *coroutine; 2267 int ret; 2268 } CoroutineIOCompletion; 2269 2270 static void bdrv_co_io_em_complete(void *opaque, int ret) 2271 { 2272 CoroutineIOCompletion *co = opaque; 2273 2274 co->ret = ret; 2275 qemu_coroutine_enter(co->coroutine, NULL); 2276 } 2277 2278 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 2279 int nb_sectors, QEMUIOVector *iov, 2280 bool is_write) 2281 { 2282 CoroutineIOCompletion co = { 2283 .coroutine = qemu_coroutine_self(), 2284 }; 2285 BlockAIOCB *acb; 2286 2287 if (is_write) { 2288 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 2289 bdrv_co_io_em_complete, &co); 2290 } else { 2291 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 2292 bdrv_co_io_em_complete, &co); 2293 } 2294 2295 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 2296 if (!acb) { 2297 return -EIO; 2298 } 2299 qemu_coroutine_yield(); 2300 2301 return co.ret; 2302 } 2303 2304 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 2305 int64_t sector_num, int nb_sectors, 2306 QEMUIOVector *iov) 2307 { 2308 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 2309 } 2310 2311 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 2312 int64_t sector_num, int nb_sectors, 2313 QEMUIOVector *iov) 2314 { 2315 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 2316 } 2317 2318 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 2319 { 2320 RwCo *rwco = opaque; 2321 2322 rwco->ret = bdrv_co_flush(rwco->bs); 2323 } 2324 2325 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2326 { 2327 int ret; 2328 BdrvTrackedRequest req; 2329 2330 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2331 bdrv_is_sg(bs)) { 2332 return 0; 2333 } 2334 2335 tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); 2336 2337 /* Write back all layers by calling one driver function */ 2338 if (bs->drv->bdrv_co_flush) { 2339 ret = bs->drv->bdrv_co_flush(bs); 2340 goto out; 2341 } 2342 2343 /* Write back cached data to the OS even with cache=unsafe */ 2344 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2345 if (bs->drv->bdrv_co_flush_to_os) { 2346 ret = bs->drv->bdrv_co_flush_to_os(bs); 2347 if (ret < 0) { 2348 goto out; 2349 } 2350 } 2351 2352 /* But don't actually force it to the disk with cache=unsafe */ 2353 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2354 goto flush_parent; 2355 } 2356 2357 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2358 if (bs->drv->bdrv_co_flush_to_disk) { 2359 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2360 } else if (bs->drv->bdrv_aio_flush) { 2361 BlockAIOCB *acb; 2362 CoroutineIOCompletion co = { 2363 .coroutine = qemu_coroutine_self(), 2364 }; 2365 2366 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2367 if (acb == NULL) { 2368 ret = -EIO; 2369 } else { 2370 qemu_coroutine_yield(); 2371 ret = co.ret; 2372 } 2373 } else { 2374 /* 2375 * Some block drivers always operate in either writethrough or unsafe 2376 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2377 * know how the server works (because the behaviour is hardcoded or 2378 * depends on server-side configuration), so we can't ensure that 2379 * everything is safe on disk. Returning an error doesn't work because 2380 * that would break guests even if the server operates in writethrough 2381 * mode. 
2382 * 2383 * Let's hope the user knows what he's doing. 2384 */ 2385 ret = 0; 2386 } 2387 if (ret < 0) { 2388 goto out; 2389 } 2390 2391 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 2392 * in the case of cache=unsafe, so there are no useless flushes. 2393 */ 2394 flush_parent: 2395 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; 2396 out: 2397 tracked_request_end(&req); 2398 return ret; 2399 } 2400 2401 int bdrv_flush(BlockDriverState *bs) 2402 { 2403 Coroutine *co; 2404 RwCo rwco = { 2405 .bs = bs, 2406 .ret = NOT_DONE, 2407 }; 2408 2409 if (qemu_in_coroutine()) { 2410 /* Fast-path if already in coroutine context */ 2411 bdrv_flush_co_entry(&rwco); 2412 } else { 2413 AioContext *aio_context = bdrv_get_aio_context(bs); 2414 2415 co = qemu_coroutine_create(bdrv_flush_co_entry); 2416 qemu_coroutine_enter(co, &rwco); 2417 while (rwco.ret == NOT_DONE) { 2418 aio_poll(aio_context, true); 2419 } 2420 } 2421 2422 return rwco.ret; 2423 } 2424 2425 typedef struct DiscardCo { 2426 BlockDriverState *bs; 2427 int64_t sector_num; 2428 int nb_sectors; 2429 int ret; 2430 } DiscardCo; 2431 static void coroutine_fn bdrv_discard_co_entry(void *opaque) 2432 { 2433 DiscardCo *rwco = opaque; 2434 2435 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); 2436 } 2437 2438 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, 2439 int nb_sectors) 2440 { 2441 BdrvTrackedRequest req; 2442 int max_discard, ret; 2443 2444 if (!bs->drv) { 2445 return -ENOMEDIUM; 2446 } 2447 2448 ret = bdrv_check_request(bs, sector_num, nb_sectors); 2449 if (ret < 0) { 2450 return ret; 2451 } else if (bs->read_only) { 2452 return -EPERM; 2453 } 2454 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 2455 2456 /* Do nothing if disabled. 
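 * BDRV_O_UNMAP is normally set from the caller's configuration (for example
 * a discard=unmap drive option); without it, discard requests are silently
 * ignored.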
*/ 2457 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2458 return 0; 2459 } 2460 2461 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { 2462 return 0; 2463 } 2464 2465 tracked_request_begin(&req, bs, sector_num, nb_sectors, 2466 BDRV_TRACKED_DISCARD); 2467 bdrv_set_dirty(bs, sector_num, nb_sectors); 2468 2469 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); 2470 while (nb_sectors > 0) { 2471 int ret; 2472 int num = nb_sectors; 2473 2474 /* align request */ 2475 if (bs->bl.discard_alignment && 2476 num >= bs->bl.discard_alignment && 2477 sector_num % bs->bl.discard_alignment) { 2478 if (num > bs->bl.discard_alignment) { 2479 num = bs->bl.discard_alignment; 2480 } 2481 num -= sector_num % bs->bl.discard_alignment; 2482 } 2483 2484 /* limit request size */ 2485 if (num > max_discard) { 2486 num = max_discard; 2487 } 2488 2489 if (bs->drv->bdrv_co_discard) { 2490 ret = bs->drv->bdrv_co_discard(bs, sector_num, num); 2491 } else { 2492 BlockAIOCB *acb; 2493 CoroutineIOCompletion co = { 2494 .coroutine = qemu_coroutine_self(), 2495 }; 2496 2497 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num, 2498 bdrv_co_io_em_complete, &co); 2499 if (acb == NULL) { 2500 ret = -EIO; 2501 goto out; 2502 } else { 2503 qemu_coroutine_yield(); 2504 ret = co.ret; 2505 } 2506 } 2507 if (ret && ret != -ENOTSUP) { 2508 goto out; 2509 } 2510 2511 sector_num += num; 2512 nb_sectors -= num; 2513 } 2514 ret = 0; 2515 out: 2516 tracked_request_end(&req); 2517 return ret; 2518 } 2519 2520 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) 2521 { 2522 Coroutine *co; 2523 DiscardCo rwco = { 2524 .bs = bs, 2525 .sector_num = sector_num, 2526 .nb_sectors = nb_sectors, 2527 .ret = NOT_DONE, 2528 }; 2529 2530 if (qemu_in_coroutine()) { 2531 /* Fast-path if already in coroutine context */ 2532 bdrv_discard_co_entry(&rwco); 2533 } else { 2534 AioContext *aio_context = bdrv_get_aio_context(bs); 2535 2536 co = qemu_coroutine_create(bdrv_discard_co_entry); 2537 qemu_coroutine_enter(co, &rwco); 2538 while (rwco.ret == NOT_DONE) { 2539 aio_poll(aio_context, true); 2540 } 2541 } 2542 2543 return rwco.ret; 2544 } 2545 2546 typedef struct { 2547 CoroutineIOCompletion *co; 2548 QEMUBH *bh; 2549 } BdrvIoctlCompletionData; 2550 2551 static void bdrv_ioctl_bh_cb(void *opaque) 2552 { 2553 BdrvIoctlCompletionData *data = opaque; 2554 2555 bdrv_co_io_em_complete(data->co, -ENOTSUP); 2556 qemu_bh_delete(data->bh); g_free(data); /* allocated in bdrv_co_do_ioctl() */ 2557 } 2558 2559 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) 2560 { 2561 BlockDriver *drv = bs->drv; 2562 BdrvTrackedRequest tracked_req; 2563 CoroutineIOCompletion co = { 2564 .coroutine = qemu_coroutine_self(), 2565 }; 2566 BlockAIOCB *acb; 2567 2568 tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); 2569 if (!drv || !drv->bdrv_aio_ioctl) { 2570 co.ret = -ENOTSUP; 2571 goto out; 2572 } 2573 2574 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 2575 if (!acb) { 2576 BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); 2577 data->bh = aio_bh_new(bdrv_get_aio_context(bs), 2578 bdrv_ioctl_bh_cb, data); 2579 data->co = &co; 2580 qemu_bh_schedule(data->bh); 2581 } 2582 qemu_coroutine_yield(); 2583 out: 2584 tracked_request_end(&tracked_req); 2585 return co.ret; 2586 } 2587 2588 typedef struct { 2589 BlockDriverState *bs; 2590 int req; 2591 void *buf; 2592 int ret; 2593 } BdrvIoctlCoData; 2594 2595 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) 2596 { 2597 BdrvIoctlCoData *data = opaque;
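    /* The result is stored here so that bdrv_ioctl() below can stop polling
     * once the value changes from its -EINPROGRESS sentinel. */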
2598 data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); 2599 } 2600 2601 /* needed for generic scsi interface */ 2602 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 2603 { 2604 BdrvIoctlCoData data = { 2605 .bs = bs, 2606 .req = req, 2607 .buf = buf, 2608 .ret = -EINPROGRESS, 2609 }; 2610 2611 if (qemu_in_coroutine()) { 2612 /* Fast-path if already in coroutine context */ 2613 bdrv_co_ioctl_entry(&data); 2614 } else { 2615 Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); 2616 2617 qemu_coroutine_enter(co, &data); 2618 while (data.ret == -EINPROGRESS) { 2619 aio_poll(bdrv_get_aio_context(bs), true); 2620 } 2621 } 2622 return data.ret; 2623 } 2624 2625 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) 2626 { 2627 BlockAIOCBCoroutine *acb = opaque; 2628 acb->req.error = bdrv_co_do_ioctl(acb->common.bs, 2629 acb->req.req, acb->req.buf); 2630 bdrv_co_complete(acb); 2631 } 2632 2633 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, 2634 unsigned long int req, void *buf, 2635 BlockCompletionFunc *cb, void *opaque) 2636 { 2637 BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, 2638 bs, cb, opaque); 2639 Coroutine *co; 2640 2641 acb->need_bh = true; 2642 acb->req.error = -EINPROGRESS; 2643 acb->req.req = req; 2644 acb->req.buf = buf; 2645 co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); 2646 qemu_coroutine_enter(co, acb); 2647 2648 bdrv_co_maybe_schedule_bh(acb); 2649 return &acb->common; 2650 } 2651 2652 void *qemu_blockalign(BlockDriverState *bs, size_t size) 2653 { 2654 return qemu_memalign(bdrv_opt_mem_align(bs), size); 2655 } 2656 2657 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 2658 { 2659 return memset(qemu_blockalign(bs, size), 0, size); 2660 } 2661 2662 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 2663 { 2664 size_t align = bdrv_opt_mem_align(bs); 2665 2666 /* Ensure that NULL is never returned on success */ 2667 assert(align > 0); 2668 if (size == 0) { 2669 size = align; 2670 } 2671 2672 return qemu_try_memalign(align, size); 2673 } 2674 2675 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 2676 { 2677 void *mem = qemu_try_blockalign(bs, size); 2678 2679 if (mem) { 2680 memset(mem, 0, size); 2681 } 2682 2683 return mem; 2684 } 2685 2686 /* 2687 * Check if all memory in this vector is sector aligned. 
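 * More precisely, every iov_base and iov_len is checked against
 * bdrv_min_mem_align(bs), i.e. the backend's minimum memory alignment.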
2688 */ 2689 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 2690 { 2691 int i; 2692 size_t alignment = bdrv_min_mem_align(bs); 2693 2694 for (i = 0; i < qiov->niov; i++) { 2695 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 2696 return false; 2697 } 2698 if (qiov->iov[i].iov_len % alignment) { 2699 return false; 2700 } 2701 } 2702 2703 return true; 2704 } 2705 2706 void bdrv_add_before_write_notifier(BlockDriverState *bs, 2707 NotifierWithReturn *notifier) 2708 { 2709 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 2710 } 2711 2712 void bdrv_io_plug(BlockDriverState *bs) 2713 { 2714 BlockDriver *drv = bs->drv; 2715 if (drv && drv->bdrv_io_plug) { 2716 drv->bdrv_io_plug(bs); 2717 } else if (bs->file) { 2718 bdrv_io_plug(bs->file->bs); 2719 } 2720 } 2721 2722 void bdrv_io_unplug(BlockDriverState *bs) 2723 { 2724 BlockDriver *drv = bs->drv; 2725 if (drv && drv->bdrv_io_unplug) { 2726 drv->bdrv_io_unplug(bs); 2727 } else if (bs->file) { 2728 bdrv_io_unplug(bs->file->bs); 2729 } 2730 } 2731 2732 void bdrv_flush_io_queue(BlockDriverState *bs) 2733 { 2734 BlockDriver *drv = bs->drv; 2735 if (drv && drv->bdrv_flush_io_queue) { 2736 drv->bdrv_flush_io_queue(bs); 2737 } else if (bs->file) { 2738 bdrv_flush_io_queue(bs->file->bs); 2739 } 2740 bdrv_start_throttled_reqs(bs); 2741 } 2742 2743 void bdrv_drained_begin(BlockDriverState *bs) 2744 { 2745 if (!bs->quiesce_counter++) { 2746 aio_disable_external(bdrv_get_aio_context(bs)); 2747 } 2748 bdrv_drain(bs); 2749 } 2750 2751 void bdrv_drained_end(BlockDriverState *bs) 2752 { 2753 assert(bs->quiesce_counter > 0); 2754 if (--bs->quiesce_counter > 0) { 2755 return; 2756 } 2757 aio_enable_external(bdrv_get_aio_context(bs)); 2758 } 2759
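/*
 * Drained-section usage sketch (illustrative only, not part of the original
 * file; do_graph_change() is a hypothetical caller-supplied helper).  Code
 * that needs the device to be quiescent brackets the critical section with
 * bdrv_drained_begin()/bdrv_drained_end():
 *
 *     bdrv_drained_begin(bs);   // stops new external requests, drains old ones
 *     do_graph_change(bs);
 *     bdrv_drained_end(bs);     // re-enables external request sources
 *
 * bdrv_drained_begin() disables the external AioContext handlers and then
 * calls bdrv_drain(), as implemented above.
 */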