/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_group_config(bs, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
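    /* Clearing the flag first ensures that the queued requests restarted
     * below run to completion without being throttled again. */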
    bs->io_limits_enabled = false;
    bdrv_start_throttled_reqs(bs);
    throttle_group_unregister_bs(bs);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is already part of the group we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
        bs->bl.max_iov = bs->file->bs->bl.max_iov;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
        bs->bl.max_iov =
            MIN(bs->bl.max_iov,
                bs->backing->bs->bl.max_iov);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
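 *
 * Block jobs are the typical users: the streaming job, for instance, enables
 * copy-on-read while it runs so that guest reads help populate the top image,
 * and disables it again when it finishes.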
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/* Check if any requests are in-flight (including throttled requests) */
bool bdrv_requests_pending(BlockDriverState *bs)
{
    BdrvChild *child;

    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }

    QLIST_FOREACH(child, &bs->children, next) {
        if (bdrv_requests_pending(child->bs)) {
            return true;
        }
    }

    return false;
}

static void bdrv_drain_recurse(BlockDriverState *bs)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_drain) {
        bs->drv->bdrv_drain(bs);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_drain_recurse(child->bs);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts.  In that case, use
 * bdrv_drain_all() instead.
 */
void bdrv_drain(BlockDriverState *bs)
{
    bool busy = true;

    bdrv_drain_recurse(bs);
    while (busy) {
        /* Keep iterating */
        bdrv_flush_io_queue(bs);
        busy = bdrv_requests_pending(bs);
        busy |= aio_poll(bdrv_get_aio_context(bs), busy);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs = NULL;
    GSList *aio_ctxs = NULL, *ctx;

    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_pause(bs->job);
        }
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion.  Therefore we must keep looping until there is no
     * more activity rather than simply draining each device independently.
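     *
     * The loop below therefore polls each AioContext in turn and repeats
     * until a complete pass finds no pending requests anywhere.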
     */
    while (busy) {
        busy = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;
            bs = NULL;

            aio_context_acquire(aio_context);
            while ((bs = bdrv_next(bs))) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    bdrv_flush_io_queue(bs);
                    if (bdrv_requests_pending(bs)) {
                        busy = true;
                        aio_poll(aio_context, busy);
                    }
                }
            }
            busy |= aio_poll(aio_context, false);
            aio_context_release(aio_context);
        }
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_resume(bs->job);
        }
        aio_context_release(aio_context);
    }
    g_slist_free(aio_ctxs);
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

static bool coroutine_fn
wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In a synchronous call context, when the vcpu is blocked, the throttling
     * timer will not fire; the I/O throttling function therefore has to be
     * disabled here if it has been enabled.
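     *
     * bdrv_io_limits_disable() also restarts any requests that are already
     * queued, so nothing is left waiting on a throttling timer.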
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
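 *
 * Since ranges that already read as zeroes are skipped, the cost is roughly
 * proportional to the amount of data that actually needs to be written.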
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    BlockDriverState *file;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
        if (nb_sectors <= 0) {
            return 0;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
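     *
     * For example, with 64k clusters a 4k guest read still copies the whole
     * surrounding cluster, so the image never ends up with a partially
     * populated cluster.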
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them.
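         *
         * Requests that only touch neighbouring clusters do not overlap here,
         * so they are not forced to wait on each other.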
         */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!bs->zero_beyond_eof) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF */
        int64_t total_sectors, max_nb_sectors;

        total_sectors = bdrv_nb_sectors(bs);
        if (total_sectors < 0) {
            ret = total_sectors;
            goto out;
        }

        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (nb_sectors < max_nb_sectors) {
            ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
        } else if (max_nb_sectors > 0) {
            QEMUIOVector local_qiov;

            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, 0,
                              max_nb_sectors * BDRV_SECTOR_SIZE);

            ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
                                     &local_qiov);

            qemu_iovec_destroy(&local_qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* Don't do copy-on-read if we read data before write operation */
    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        throttle_group_co_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_NO_SERIALISING);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
                                        BDRV_REQUEST_MAX_SECTORS);

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_offset < offset + bytes) {
        bs->wr_highest_offset = offset + bytes;
    }

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = align - ((offset + bytes) & (align - 1));

    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base   = buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
                                   &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail.
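         * The aligned block is first read into the bounce buffer; the leading
         * bytes that fall inside the zero-write range are then cleared and
         * the whole block is written back.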
         */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(bs, req, offset, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        throttle_group_co_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (!qiov) {
        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs = NULL;
    int result = 0;

    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    BlockDriverState **file;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * If the returned value is positive and the BDRV_BLOCK_OFFSET_VALID bit is
 * set, 'file' points to the BDS which the sector range is allocated in.
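 *
 * The return value is a combination of the BDRV_BLOCK_* flags defined in
 * include/block/block.h.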
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum,
                                                     BlockDriverState **file)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    *file = NULL;
    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
                                            file);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum, file);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (*file && *file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        BlockDriverState *file2;
        int file_pnum;

        ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, &file_pnum, &file2);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (!file_pnum) {
                /* !file_pnum indicates an offset at or beyond the EOF; it is
                 * perfectly valid for the format block driver to point to such
                 * offsets, so catch it and mark everything as zero */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

    return ret;
}

static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
        BlockDriverState *base,
        int64_t sector_num,
        int nb_sectors,
        int *pnum,
        BlockDriverState **file)
{
    BlockDriverState *p;
    int64_t ret = 0;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
            break;
        }
        /* [sector_num, pnum] unallocated on this layer, which could be only
         * the first part of [sector_num, nb_sectors].
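         * Limit the query of the next (backing) layer to that unallocated
         * head of the range.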
         */
        nb_sectors = MIN(nb_sectors, *pnum);
    }
    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status_above() */
static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;

    data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
                                               data->sector_num,
                                               data->nb_sectors,
                                               data->pnum,
                                               data->file);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status_above().
 *
 * See bdrv_co_get_block_status_above() for details.
 */
int64_t bdrv_get_block_status_above(BlockDriverState *bs,
                                    BlockDriverState *base,
                                    int64_t sector_num,
                                    int nb_sectors, int *pnum,
                                    BlockDriverState **file)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .base = base,
        .file = file,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_above_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}

int64_t bdrv_get_block_status(BlockDriverState *bs,
                              int64_t sector_num,
                              int nb_sectors, int *pnum,
                              BlockDriverState **file)
{
    return bdrv_get_block_status_above(bs, backing_bs(bs),
                                       sector_num, nb_sectors, pnum, file);
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    BlockDriverState *file;
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
                                        &file);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nb_sectors] allocated.
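         *
         * Only narrow the search range when pnum_inter was not merely clamped
         * by the end of a shorter intermediate image.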
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    }

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
    return -ENOTSUP;
}

/**************************************************************/
/* async I/Os */

BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
            bs->bl.max_iov) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    if (bs->blk) {
        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
                              num_reqs - outidx - 1);
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted.
In the error case, this function returns -1, and any of the 1968 * requests may or may not have been submitted yet. In particular, this means that the 1969 * callback will be called for some of the requests but not for others. The 1970 * caller must check the error field of the BlockRequest to wait for the right 1971 * callbacks (if error != 0, no callback will be called). 1972 * 1973 * The implementation may modify the contents of the reqs array, e.g. to merge 1974 * requests. However, the fields opaque and error are left unmodified as they 1975 * are used to signal failure for a single request to the caller. 1976 */ 1977 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 1978 { 1979 MultiwriteCB *mcb; 1980 int i; 1981 1982 /* don't submit writes if we don't have a medium */ 1983 if (bs->drv == NULL) { 1984 for (i = 0; i < num_reqs; i++) { 1985 reqs[i].error = -ENOMEDIUM; 1986 } 1987 return -1; 1988 } 1989 1990 if (num_reqs == 0) { 1991 return 0; 1992 } 1993 1994 // Create MultiwriteCB structure 1995 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 1996 mcb->num_requests = 0; 1997 mcb->num_callbacks = num_reqs; 1998 1999 for (i = 0; i < num_reqs; i++) { 2000 mcb->callbacks[i].cb = reqs[i].cb; 2001 mcb->callbacks[i].opaque = reqs[i].opaque; 2002 } 2003 2004 // Check for mergeable requests 2005 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 2006 2007 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 2008 2009 /* Run the aio requests. */ 2010 mcb->num_requests = num_reqs; 2011 for (i = 0; i < num_reqs; i++) { 2012 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 2013 reqs[i].nb_sectors, reqs[i].flags, 2014 multiwrite_cb, mcb, 2015 true); 2016 } 2017 2018 return 0; 2019 } 2020 2021 void bdrv_aio_cancel(BlockAIOCB *acb) 2022 { 2023 qemu_aio_ref(acb); 2024 bdrv_aio_cancel_async(acb); 2025 while (acb->refcnt > 1) { 2026 if (acb->aiocb_info->get_aio_context) { 2027 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 2028 } else if (acb->bs) { 2029 aio_poll(bdrv_get_aio_context(acb->bs), true); 2030 } else { 2031 abort(); 2032 } 2033 } 2034 qemu_aio_unref(acb); 2035 } 2036 2037 /* Async version of aio cancel. The caller is not blocked if the acb implements 2038 * cancel_async; otherwise we do nothing and let the request complete normally. 2039 * In either case the completion callback must be called.
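 *
 * Illustrative usage (a sketch only; my_cb and my_opaque are hypothetical
 * names that do not exist in this file):
 *
 *     BlockAIOCB *acb = bdrv_aio_flush(bs, my_cb, my_opaque);
 *     ...
 *     bdrv_aio_cancel_async(acb);
 *
 * The call returns immediately; my_cb still runs later, either with the
 * normal result or, typically, with -ECANCELED if the driver was able to
 * cancel the request.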
*/ 2040 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2041 { 2042 if (acb->aiocb_info->cancel_async) { 2043 acb->aiocb_info->cancel_async(acb); 2044 } 2045 } 2046 2047 /**************************************************************/ 2048 /* async block device emulation */ 2049 2050 typedef struct BlockAIOCBSync { 2051 BlockAIOCB common; 2052 QEMUBH *bh; 2053 int ret; 2054 /* vector translation state */ 2055 QEMUIOVector *qiov; 2056 uint8_t *bounce; 2057 int is_write; 2058 } BlockAIOCBSync; 2059 2060 static const AIOCBInfo bdrv_em_aiocb_info = { 2061 .aiocb_size = sizeof(BlockAIOCBSync), 2062 }; 2063 2064 static void bdrv_aio_bh_cb(void *opaque) 2065 { 2066 BlockAIOCBSync *acb = opaque; 2067 2068 if (!acb->is_write && acb->ret >= 0) { 2069 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 2070 } 2071 qemu_vfree(acb->bounce); 2072 acb->common.cb(acb->common.opaque, acb->ret); 2073 qemu_bh_delete(acb->bh); 2074 acb->bh = NULL; 2075 qemu_aio_unref(acb); 2076 } 2077 2078 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 2079 int64_t sector_num, 2080 QEMUIOVector *qiov, 2081 int nb_sectors, 2082 BlockCompletionFunc *cb, 2083 void *opaque, 2084 int is_write) 2085 2086 { 2087 BlockAIOCBSync *acb; 2088 2089 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 2090 acb->is_write = is_write; 2091 acb->qiov = qiov; 2092 acb->bounce = qemu_try_blockalign(bs, qiov->size); 2093 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); 2094 2095 if (acb->bounce == NULL) { 2096 acb->ret = -ENOMEM; 2097 } else if (is_write) { 2098 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 2099 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 2100 } else { 2101 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 2102 } 2103 2104 qemu_bh_schedule(acb->bh); 2105 2106 return &acb->common; 2107 } 2108 2109 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 2110 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2111 BlockCompletionFunc *cb, void *opaque) 2112 { 2113 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 2114 } 2115 2116 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 2117 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2118 BlockCompletionFunc *cb, void *opaque) 2119 { 2120 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 2121 } 2122 2123 2124 typedef struct BlockAIOCBCoroutine { 2125 BlockAIOCB common; 2126 BlockRequest req; 2127 bool is_write; 2128 bool need_bh; 2129 bool *done; 2130 QEMUBH* bh; 2131 } BlockAIOCBCoroutine; 2132 2133 static const AIOCBInfo bdrv_em_co_aiocb_info = { 2134 .aiocb_size = sizeof(BlockAIOCBCoroutine), 2135 }; 2136 2137 static void bdrv_co_complete(BlockAIOCBCoroutine *acb) 2138 { 2139 if (!acb->need_bh) { 2140 acb->common.cb(acb->common.opaque, acb->req.error); 2141 qemu_aio_unref(acb); 2142 } 2143 } 2144 2145 static void bdrv_co_em_bh(void *opaque) 2146 { 2147 BlockAIOCBCoroutine *acb = opaque; 2148 2149 assert(!acb->need_bh); 2150 qemu_bh_delete(acb->bh); 2151 bdrv_co_complete(acb); 2152 } 2153 2154 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) 2155 { 2156 acb->need_bh = false; 2157 if (acb->req.error != -EINPROGRESS) { 2158 BlockDriverState *bs = acb->common.bs; 2159 2160 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 2161 qemu_bh_schedule(acb->bh); 2162 } 2163 } 2164 2165 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 2166 static void coroutine_fn 
bdrv_co_do_rw(void *opaque) 2167 { 2168 BlockAIOCBCoroutine *acb = opaque; 2169 BlockDriverState *bs = acb->common.bs; 2170 2171 if (!acb->is_write) { 2172 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 2173 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2174 } else { 2175 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 2176 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2177 } 2178 2179 bdrv_co_complete(acb); 2180 } 2181 2182 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 2183 int64_t sector_num, 2184 QEMUIOVector *qiov, 2185 int nb_sectors, 2186 BdrvRequestFlags flags, 2187 BlockCompletionFunc *cb, 2188 void *opaque, 2189 bool is_write) 2190 { 2191 Coroutine *co; 2192 BlockAIOCBCoroutine *acb; 2193 2194 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2195 acb->need_bh = true; 2196 acb->req.error = -EINPROGRESS; 2197 acb->req.sector = sector_num; 2198 acb->req.nb_sectors = nb_sectors; 2199 acb->req.qiov = qiov; 2200 acb->req.flags = flags; 2201 acb->is_write = is_write; 2202 2203 co = qemu_coroutine_create(bdrv_co_do_rw); 2204 qemu_coroutine_enter(co, acb); 2205 2206 bdrv_co_maybe_schedule_bh(acb); 2207 return &acb->common; 2208 } 2209 2210 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 2211 { 2212 BlockAIOCBCoroutine *acb = opaque; 2213 BlockDriverState *bs = acb->common.bs; 2214 2215 acb->req.error = bdrv_co_flush(bs); 2216 bdrv_co_complete(acb); 2217 } 2218 2219 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, 2220 BlockCompletionFunc *cb, void *opaque) 2221 { 2222 trace_bdrv_aio_flush(bs, opaque); 2223 2224 Coroutine *co; 2225 BlockAIOCBCoroutine *acb; 2226 2227 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2228 acb->need_bh = true; 2229 acb->req.error = -EINPROGRESS; 2230 2231 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 2232 qemu_coroutine_enter(co, acb); 2233 2234 bdrv_co_maybe_schedule_bh(acb); 2235 return &acb->common; 2236 } 2237 2238 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 2239 { 2240 BlockAIOCBCoroutine *acb = opaque; 2241 BlockDriverState *bs = acb->common.bs; 2242 2243 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 2244 bdrv_co_complete(acb); 2245 } 2246 2247 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, 2248 int64_t sector_num, int nb_sectors, 2249 BlockCompletionFunc *cb, void *opaque) 2250 { 2251 Coroutine *co; 2252 BlockAIOCBCoroutine *acb; 2253 2254 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 2255 2256 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2257 acb->need_bh = true; 2258 acb->req.error = -EINPROGRESS; 2259 acb->req.sector = sector_num; 2260 acb->req.nb_sectors = nb_sectors; 2261 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 2262 qemu_coroutine_enter(co, acb); 2263 2264 bdrv_co_maybe_schedule_bh(acb); 2265 return &acb->common; 2266 } 2267 2268 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 2269 BlockCompletionFunc *cb, void *opaque) 2270 { 2271 BlockAIOCB *acb; 2272 2273 acb = g_malloc(aiocb_info->aiocb_size); 2274 acb->aiocb_info = aiocb_info; 2275 acb->bs = bs; 2276 acb->cb = cb; 2277 acb->opaque = opaque; 2278 acb->refcnt = 1; 2279 return acb; 2280 } 2281 2282 void qemu_aio_ref(void *p) 2283 { 2284 BlockAIOCB *acb = p; 2285 acb->refcnt++; 2286 } 2287 2288 void qemu_aio_unref(void *p) 2289 { 2290 BlockAIOCB *acb = p; 2291 assert(acb->refcnt > 0); 2292 if (--acb->refcnt == 0) { 2293 g_free(acb); 2294 } 2295 } 2296 2297 
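/*
 * Illustrative sketch of how the AIOCB helpers above are meant to be used by
 * an emulation layer or block driver.  The names MyAIOCB and my_aiocb_info
 * are hypothetical and do not exist in this file; the pattern mirrors
 * BlockAIOCBSync/bdrv_em_aiocb_info above.
 *
 *     typedef struct MyAIOCB {
 *         BlockAIOCB common;          // must be the first member
 *         int my_state;               // driver-specific state
 *     } MyAIOCB;
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 *     ... start the request, later invoke acb->common.cb(...) ...
 *     qemu_aio_unref(acb);            // drop the initial reference
 *
 * qemu_aio_get() returns the AIOCB with refcnt == 1; bdrv_aio_cancel() takes
 * an extra reference with qemu_aio_ref() while it polls for completion.
 */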
/**************************************************************/ 2298 /* Coroutine block device emulation */ 2299 2300 typedef struct CoroutineIOCompletion { 2301 Coroutine *coroutine; 2302 int ret; 2303 } CoroutineIOCompletion; 2304 2305 static void bdrv_co_io_em_complete(void *opaque, int ret) 2306 { 2307 CoroutineIOCompletion *co = opaque; 2308 2309 co->ret = ret; 2310 qemu_coroutine_enter(co->coroutine, NULL); 2311 } 2312 2313 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 2314 int nb_sectors, QEMUIOVector *iov, 2315 bool is_write) 2316 { 2317 CoroutineIOCompletion co = { 2318 .coroutine = qemu_coroutine_self(), 2319 }; 2320 BlockAIOCB *acb; 2321 2322 if (is_write) { 2323 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 2324 bdrv_co_io_em_complete, &co); 2325 } else { 2326 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 2327 bdrv_co_io_em_complete, &co); 2328 } 2329 2330 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 2331 if (!acb) { 2332 return -EIO; 2333 } 2334 qemu_coroutine_yield(); 2335 2336 return co.ret; 2337 } 2338 2339 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 2340 int64_t sector_num, int nb_sectors, 2341 QEMUIOVector *iov) 2342 { 2343 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 2344 } 2345 2346 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 2347 int64_t sector_num, int nb_sectors, 2348 QEMUIOVector *iov) 2349 { 2350 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 2351 } 2352 2353 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 2354 { 2355 RwCo *rwco = opaque; 2356 2357 rwco->ret = bdrv_co_flush(rwco->bs); 2358 } 2359 2360 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2361 { 2362 int ret; 2363 BdrvTrackedRequest req; 2364 2365 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2366 bdrv_is_sg(bs)) { 2367 return 0; 2368 } 2369 2370 tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); 2371 /* Write back cached data to the OS even with cache=unsafe */ 2372 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2373 if (bs->drv->bdrv_co_flush_to_os) { 2374 ret = bs->drv->bdrv_co_flush_to_os(bs); 2375 if (ret < 0) { 2376 goto out; 2377 } 2378 } 2379 2380 /* But don't actually force it to the disk with cache=unsafe */ 2381 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2382 goto flush_parent; 2383 } 2384 2385 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2386 if (bs->drv->bdrv_co_flush_to_disk) { 2387 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2388 } else if (bs->drv->bdrv_aio_flush) { 2389 BlockAIOCB *acb; 2390 CoroutineIOCompletion co = { 2391 .coroutine = qemu_coroutine_self(), 2392 }; 2393 2394 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2395 if (acb == NULL) { 2396 ret = -EIO; 2397 } else { 2398 qemu_coroutine_yield(); 2399 ret = co.ret; 2400 } 2401 } else { 2402 /* 2403 * Some block drivers always operate in either writethrough or unsafe 2404 * mode and therefore don't support bdrv_flush. Usually qemu doesn't 2405 * know how the server works (because the behaviour is hardcoded or 2406 * depends on server-side configuration), so we can't ensure that 2407 * everything is safe on disk. Returning an error doesn't work because 2408 * that would break guests even if the server operates in writethrough 2409 * mode. 2410 * 2411 * Let's hope the user knows what they're doing. 2412 */ 2413 ret = 0; 2414 } 2415 if (ret < 0) { 2416 goto out; 2417 } 2418 2419 /* Now flush the underlying protocol.
It will also have BDRV_O_NO_FLUSH 2420 * in the case of cache=unsafe, so there are no useless flushes. 2421 */ 2422 flush_parent: 2423 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; 2424 out: 2425 tracked_request_end(&req); 2426 return ret; 2427 } 2428 2429 int bdrv_flush(BlockDriverState *bs) 2430 { 2431 Coroutine *co; 2432 RwCo rwco = { 2433 .bs = bs, 2434 .ret = NOT_DONE, 2435 }; 2436 2437 if (qemu_in_coroutine()) { 2438 /* Fast-path if already in coroutine context */ 2439 bdrv_flush_co_entry(&rwco); 2440 } else { 2441 AioContext *aio_context = bdrv_get_aio_context(bs); 2442 2443 co = qemu_coroutine_create(bdrv_flush_co_entry); 2444 qemu_coroutine_enter(co, &rwco); 2445 while (rwco.ret == NOT_DONE) { 2446 aio_poll(aio_context, true); 2447 } 2448 } 2449 2450 return rwco.ret; 2451 } 2452 2453 typedef struct DiscardCo { 2454 BlockDriverState *bs; 2455 int64_t sector_num; 2456 int nb_sectors; 2457 int ret; 2458 } DiscardCo; 2459 static void coroutine_fn bdrv_discard_co_entry(void *opaque) 2460 { 2461 DiscardCo *rwco = opaque; 2462 2463 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); 2464 } 2465 2466 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, 2467 int nb_sectors) 2468 { 2469 BdrvTrackedRequest req; 2470 int max_discard, ret; 2471 2472 if (!bs->drv) { 2473 return -ENOMEDIUM; 2474 } 2475 2476 ret = bdrv_check_request(bs, sector_num, nb_sectors); 2477 if (ret < 0) { 2478 return ret; 2479 } else if (bs->read_only) { 2480 return -EPERM; 2481 } 2482 assert(!(bs->open_flags & BDRV_O_INACTIVE)); 2483 2484 /* Do nothing if disabled. */ 2485 if (!(bs->open_flags & BDRV_O_UNMAP)) { 2486 return 0; 2487 } 2488 2489 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { 2490 return 0; 2491 } 2492 2493 tracked_request_begin(&req, bs, sector_num, nb_sectors, 2494 BDRV_TRACKED_DISCARD); 2495 bdrv_set_dirty(bs, sector_num, nb_sectors); 2496 2497 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); 2498 while (nb_sectors > 0) { 2499 int ret; 2500 int num = nb_sectors; 2501 2502 /* align request */ 2503 if (bs->bl.discard_alignment && 2504 num >= bs->bl.discard_alignment && 2505 sector_num % bs->bl.discard_alignment) { 2506 if (num > bs->bl.discard_alignment) { 2507 num = bs->bl.discard_alignment; 2508 } 2509 num -= sector_num % bs->bl.discard_alignment; 2510 } 2511 2512 /* limit request size */ 2513 if (num > max_discard) { 2514 num = max_discard; 2515 } 2516 2517 if (bs->drv->bdrv_co_discard) { 2518 ret = bs->drv->bdrv_co_discard(bs, sector_num, num); 2519 } else { 2520 BlockAIOCB *acb; 2521 CoroutineIOCompletion co = { 2522 .coroutine = qemu_coroutine_self(), 2523 }; 2524 2525 /* Submit only the aligned/clamped chunk, like the coroutine path */ acb = bs->drv->bdrv_aio_discard(bs, sector_num, num, 2526 bdrv_co_io_em_complete, &co); 2527 if (acb == NULL) { 2528 ret = -EIO; 2529 goto out; 2530 } else { 2531 qemu_coroutine_yield(); 2532 ret = co.ret; 2533 } 2534 } 2535 if (ret && ret != -ENOTSUP) { 2536 goto out; 2537 } 2538 2539 sector_num += num; 2540 nb_sectors -= num; 2541 } 2542 ret = 0; 2543 out: 2544 tracked_request_end(&req); 2545 return ret; 2546 } 2547 2548 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) 2549 { 2550 Coroutine *co; 2551 DiscardCo rwco = { 2552 .bs = bs, 2553 .sector_num = sector_num, 2554 .nb_sectors = nb_sectors, 2555 .ret = NOT_DONE, 2556 }; 2557 2558 if (qemu_in_coroutine()) { 2559 /* Fast-path if already in coroutine context */ 2560 bdrv_discard_co_entry(&rwco); 2561 } else { 2562 AioContext *aio_context =
bdrv_get_aio_context(bs); 2563 2564 co = qemu_coroutine_create(bdrv_discard_co_entry); 2565 qemu_coroutine_enter(co, &rwco); 2566 while (rwco.ret == NOT_DONE) { 2567 aio_poll(aio_context, true); 2568 } 2569 } 2570 2571 return rwco.ret; 2572 } 2573 2574 typedef struct { 2575 CoroutineIOCompletion *co; 2576 QEMUBH *bh; 2577 } BdrvIoctlCompletionData; 2578 2579 static void bdrv_ioctl_bh_cb(void *opaque) 2580 { 2581 BdrvIoctlCompletionData *data = opaque; 2582 2583 bdrv_co_io_em_complete(data->co, -ENOTSUP); 2584 qemu_bh_delete(data->bh); g_free(data); 2585 } 2586 2587 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) 2588 { 2589 BlockDriver *drv = bs->drv; 2590 BdrvTrackedRequest tracked_req; 2591 CoroutineIOCompletion co = { 2592 .coroutine = qemu_coroutine_self(), 2593 }; 2594 BlockAIOCB *acb; 2595 2596 tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); 2597 if (!drv || !drv->bdrv_aio_ioctl) { 2598 co.ret = -ENOTSUP; 2599 goto out; 2600 } 2601 2602 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 2603 if (!acb) { 2604 BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); 2605 data->bh = aio_bh_new(bdrv_get_aio_context(bs), 2606 bdrv_ioctl_bh_cb, data); 2607 data->co = &co; 2608 qemu_bh_schedule(data->bh); 2609 } 2610 qemu_coroutine_yield(); 2611 out: 2612 tracked_request_end(&tracked_req); 2613 return co.ret; 2614 } 2615 2616 typedef struct { 2617 BlockDriverState *bs; 2618 int req; 2619 void *buf; 2620 int ret; 2621 } BdrvIoctlCoData; 2622 2623 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) 2624 { 2625 BdrvIoctlCoData *data = opaque; 2626 data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); 2627 } 2628 2629 /* needed for generic scsi interface */ 2630 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 2631 { 2632 BdrvIoctlCoData data = { 2633 .bs = bs, 2634 .req = req, 2635 .buf = buf, 2636 .ret = -EINPROGRESS, 2637 }; 2638 2639 if (qemu_in_coroutine()) { 2640 /* Fast-path if already in coroutine context */ 2641 bdrv_co_ioctl_entry(&data); 2642 } else { 2643 Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); 2644 2645 qemu_coroutine_enter(co, &data); 2646 while (data.ret == -EINPROGRESS) { 2647 aio_poll(bdrv_get_aio_context(bs), true); 2648 } 2649 } 2650 return data.ret; 2651 } 2652 2653 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) 2654 { 2655 BlockAIOCBCoroutine *acb = opaque; 2656 acb->req.error = bdrv_co_do_ioctl(acb->common.bs, 2657 acb->req.req, acb->req.buf); 2658 bdrv_co_complete(acb); 2659 } 2660 2661 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, 2662 unsigned long int req, void *buf, 2663 BlockCompletionFunc *cb, void *opaque) 2664 { 2665 BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, 2666 bs, cb, opaque); 2667 Coroutine *co; 2668 2669 acb->need_bh = true; 2670 acb->req.error = -EINPROGRESS; 2671 acb->req.req = req; 2672 acb->req.buf = buf; 2673 co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); 2674 qemu_coroutine_enter(co, acb); 2675 2676 bdrv_co_maybe_schedule_bh(acb); 2677 return &acb->common; 2678 } 2679 2680 void *qemu_blockalign(BlockDriverState *bs, size_t size) 2681 { 2682 return qemu_memalign(bdrv_opt_mem_align(bs), size); 2683 } 2684 2685 void *qemu_blockalign0(BlockDriverState *bs, size_t size) 2686 { 2687 return memset(qemu_blockalign(bs, size), 0, size); 2688 } 2689 2690 void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 2691 { 2692 size_t align = bdrv_opt_mem_align(bs); 2693 2694 /* Ensure that NULL is never
returned on success */ 2695 assert(align > 0); 2696 if (size == 0) { 2697 size = align; 2698 } 2699 2700 return qemu_try_memalign(align, size); 2701 } 2702 2703 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 2704 { 2705 void *mem = qemu_try_blockalign(bs, size); 2706 2707 if (mem) { 2708 memset(mem, 0, size); 2709 } 2710 2711 return mem; 2712 } 2713 2714 /* 2715 * Check if all memory in this vector is sector aligned. 2716 */ 2717 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 2718 { 2719 int i; 2720 size_t alignment = bdrv_min_mem_align(bs); 2721 2722 for (i = 0; i < qiov->niov; i++) { 2723 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 2724 return false; 2725 } 2726 if (qiov->iov[i].iov_len % alignment) { 2727 return false; 2728 } 2729 } 2730 2731 return true; 2732 } 2733 2734 void bdrv_add_before_write_notifier(BlockDriverState *bs, 2735 NotifierWithReturn *notifier) 2736 { 2737 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 2738 } 2739 2740 void bdrv_io_plug(BlockDriverState *bs) 2741 { 2742 BlockDriver *drv = bs->drv; 2743 if (drv && drv->bdrv_io_plug) { 2744 drv->bdrv_io_plug(bs); 2745 } else if (bs->file) { 2746 bdrv_io_plug(bs->file->bs); 2747 } 2748 } 2749 2750 void bdrv_io_unplug(BlockDriverState *bs) 2751 { 2752 BlockDriver *drv = bs->drv; 2753 if (drv && drv->bdrv_io_unplug) { 2754 drv->bdrv_io_unplug(bs); 2755 } else if (bs->file) { 2756 bdrv_io_unplug(bs->file->bs); 2757 } 2758 } 2759 2760 void bdrv_flush_io_queue(BlockDriverState *bs) 2761 { 2762 BlockDriver *drv = bs->drv; 2763 if (drv && drv->bdrv_flush_io_queue) { 2764 drv->bdrv_flush_io_queue(bs); 2765 } else if (bs->file) { 2766 bdrv_flush_io_queue(bs->file->bs); 2767 } 2768 bdrv_start_throttled_reqs(bs); 2769 } 2770 2771 void bdrv_drained_begin(BlockDriverState *bs) 2772 { 2773 if (!bs->quiesce_counter++) { 2774 aio_disable_external(bdrv_get_aio_context(bs)); 2775 } 2776 bdrv_drain(bs); 2777 } 2778 2779 void bdrv_drained_end(BlockDriverState *bs) 2780 { 2781 assert(bs->quiesce_counter > 0); 2782 if (--bs->quiesce_counter > 0) { 2783 return; 2784 } 2785 aio_enable_external(bdrv_get_aio_context(bs)); 2786 } 2787
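/*
 * Illustrative pairing for the quiesce helpers above (a sketch, not a real
 * caller in this file):
 *
 *     bdrv_drained_begin(bs);   // stop external request sources and drain bs
 *     ... modify the graph or the device state safely ...
 *     bdrv_drained_end(bs);     // re-enable external request processing
 *
 * The calls nest: only the outermost bdrv_drained_begin() disables external
 * events for the AioContext, and only the matching outermost
 * bdrv_drained_end() re-enables them (see bs->quiesce_counter above).
 */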