/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_group_config(bs, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
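    /* Disable the limits first so restarted requests are not throttled again,
     * then drain the throttled queues before leaving the throttle group. */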
    bs->io_limits_enabled = false;
    bdrv_start_throttled_reqs(bs);
    throttle_group_unregister_bs(bs);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is a part of the same group as the one we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
        bs->bl.max_iov = bs->file->bs->bl.max_iov;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
        bs->bl.max_iov =
            MIN(bs->bl.max_iov,
                bs->backing->bs->bl.max_iov);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
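 *
 * Typical usage (sketch):
 *     bdrv_enable_copy_on_read(bs);
 *     ... reads may now populate bs from its backing file ...
 *     bdrv_disable_copy_on_read(bs);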
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/* Check if any requests are in-flight (including throttled requests) */
bool bdrv_requests_pending(BlockDriverState *bs)
{
    BdrvChild *child;

    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }

    QLIST_FOREACH(child, &bs->children, next) {
        if (bdrv_requests_pending(child->bs)) {
            return true;
        }
    }

    return false;
}

static void bdrv_drain_recurse(BlockDriverState *bs)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_drain) {
        bs->drv->bdrv_drain(bs);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_drain_recurse(child->bs);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts.  In that case, use
 * bdrv_drain_all() instead.
 */
void bdrv_drain(BlockDriverState *bs)
{
    bool busy = true;

    bdrv_drain_recurse(bs);
    while (busy) {
        /* Keep iterating */
        bdrv_flush_io_queue(bs);
        busy = bdrv_requests_pending(bs);
        busy |= aio_poll(bdrv_get_aio_context(bs), busy);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs = NULL;
    GSList *aio_ctxs = NULL, *ctx;

    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_pause(bs->job);
        }
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion.  Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
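     *
     * Each pass below flushes and polls every AioContext that owns at least
     * one BlockDriverState; we only stop after a full pass in which no
     * context reported any remaining activity.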
     */
    while (busy) {
        busy = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;
            bs = NULL;

            aio_context_acquire(aio_context);
            while ((bs = bdrv_next(bs))) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    bdrv_flush_io_queue(bs);
                    if (bdrv_requests_pending(bs)) {
                        busy = true;
                        aio_poll(aio_context, busy);
                    }
                }
            }
            busy |= aio_poll(aio_context, false);
            aio_context_release(aio_context);
        }
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_resume(bs->job);
        }
        aio_context_release(aio_context);
    }
    g_slist_free(aio_ctxs);
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs             = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

static bool coroutine_fn
wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
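     * (The limits are dropped for good: bdrv_io_limits_disable() below also
     * unregisters the BlockDriverState from its throttle group.)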
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
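 *
 * Example (sketch):
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);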
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
        if (nb_sectors <= 0) {
            return 0;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
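     * Rounding to cluster boundaries also keeps the copy-on-read write below
     * cluster-aligned, matching the cluster-granularity serialisation done in
     * bdrv_aligned_preadv().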
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them.
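         * Note that requests carrying BDRV_REQ_NO_SERIALISING deliberately
         * skip the wait_serialising_requests() call below.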
         */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!bs->zero_beyond_eof) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF */
        int64_t total_sectors, max_nb_sectors;

        total_sectors = bdrv_nb_sectors(bs);
        if (total_sectors < 0) {
            ret = total_sectors;
            goto out;
        }

        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (nb_sectors < max_nb_sectors) {
            ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
        } else if (max_nb_sectors > 0) {
            QEMUIOVector local_qiov;

            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, 0,
                              max_nb_sectors * BDRV_SECTOR_SIZE);

            ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
                                     &local_qiov);

            qemu_iovec_destroy(&local_qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* Don't do copy-on-read if we read data before write operation */
    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        throttle_group_co_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
                               int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_NO_SERIALISING);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
                                        BDRV_REQUEST_MAX_SECTORS);

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
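                 *
                 * For example, with an alignment of 8 a request for sectors
                 * 5..24 is handled as a 3-sector head (5..7), an aligned
                 * 16-sector chunk (8..23) and a final 1-sector tail (24),
                 * assuming no other limits apply.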
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_offset < offset + bytes) {
        bs->wr_highest_offset = offset + bytes;
    }

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = align - ((offset + bytes) & (align - 1));


    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base   = buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
                                   &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail.
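         * Mirrors the head handling above: read one aligned block, zero only
         * the bytes that belong to this request, then write it back.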
         */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(bs, req, offset, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        throttle_group_co_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (!qiov) {
        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
                                int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs = NULL;
    int result = 0;

    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;

/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
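 *
 * Callers that need to classify a whole range typically loop, advancing by
 * *pnum each time (sketch):
 *     for (; nb_sectors > 0; sector_num += n, nb_sectors -= n) {
 *         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
 *         ...
 *     }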
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int file_pnum;

        ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
                                        *pnum, &file_pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (!file_pnum) {
                /* !file_pnum indicates an offset at or beyond the EOF; it is
                 * perfectly valid for the format block driver to point to such
                 * offsets, so catch it and mark everything as zero */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

    return ret;
}

static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
                                                           BlockDriverState *base,
                                                           int64_t sector_num,
                                                           int nb_sectors,
                                                           int *pnum)
{
    BlockDriverState *p;
    int64_t ret = 0;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
            break;
        }
        /* [sector_num, pnum] unallocated on this layer, which could be only
         * the first part of [sector_num, nb_sectors].  */
        nb_sectors = MIN(nb_sectors, *pnum);
    }
    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status_above() */
static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;

    data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
                                               data->sector_num,
                                               data->nb_sectors,
                                               data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status_above().
 *
 * See bdrv_co_get_block_status_above() for details.
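 *
 * If called outside of coroutine context, this spawns a coroutine and polls
 * the BlockDriverState's AioContext until it has completed.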
 */
int64_t bdrv_get_block_status_above(BlockDriverState *bs,
                                    BlockDriverState *base,
                                    int64_t sector_num,
                                    int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .base = base,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_above_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}

int64_t bdrv_get_block_status(BlockDriverState *bs,
                              int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    return bdrv_get_block_status_above(bs, backing_bs(bs),
                                       sector_num, nb_sectors, pnum);
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    }

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
    return -ENOTSUP;
}

/**************************************************************/
/* async I/Os */

BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
            bs->bl.max_iov) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    if (bs->blk) {
        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
                              num_reqs - outidx - 1);
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted.
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be invoked for some of the requests but not for others. The
 * caller must check the error field of each BlockRequest to find out which
 * callbacks to wait for (if error != 0, no callback will be invoked for that
 * request).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
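 *
 * A hypothetical caller that wants to cancel without blocking could look
 * roughly like this (illustrative sketch only):
 *
 *     BlockAIOCB *acb = bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
 *                                       my_write_cb, my_opaque);
 *     ...
 *     bdrv_aio_cancel_async(acb);   // request cancellation, do not block
 *     // my_write_cb() still runs exactly once, either with the normal
 *     // completion status or with an error such as -ECANCELED if the
 *     // driver managed to cancel the request.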
*/ 2024 void bdrv_aio_cancel_async(BlockAIOCB *acb) 2025 { 2026 if (acb->aiocb_info->cancel_async) { 2027 acb->aiocb_info->cancel_async(acb); 2028 } 2029 } 2030 2031 /**************************************************************/ 2032 /* async block device emulation */ 2033 2034 typedef struct BlockAIOCBSync { 2035 BlockAIOCB common; 2036 QEMUBH *bh; 2037 int ret; 2038 /* vector translation state */ 2039 QEMUIOVector *qiov; 2040 uint8_t *bounce; 2041 int is_write; 2042 } BlockAIOCBSync; 2043 2044 static const AIOCBInfo bdrv_em_aiocb_info = { 2045 .aiocb_size = sizeof(BlockAIOCBSync), 2046 }; 2047 2048 static void bdrv_aio_bh_cb(void *opaque) 2049 { 2050 BlockAIOCBSync *acb = opaque; 2051 2052 if (!acb->is_write && acb->ret >= 0) { 2053 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 2054 } 2055 qemu_vfree(acb->bounce); 2056 acb->common.cb(acb->common.opaque, acb->ret); 2057 qemu_bh_delete(acb->bh); 2058 acb->bh = NULL; 2059 qemu_aio_unref(acb); 2060 } 2061 2062 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 2063 int64_t sector_num, 2064 QEMUIOVector *qiov, 2065 int nb_sectors, 2066 BlockCompletionFunc *cb, 2067 void *opaque, 2068 int is_write) 2069 2070 { 2071 BlockAIOCBSync *acb; 2072 2073 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 2074 acb->is_write = is_write; 2075 acb->qiov = qiov; 2076 acb->bounce = qemu_try_blockalign(bs, qiov->size); 2077 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); 2078 2079 if (acb->bounce == NULL) { 2080 acb->ret = -ENOMEM; 2081 } else if (is_write) { 2082 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 2083 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 2084 } else { 2085 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 2086 } 2087 2088 qemu_bh_schedule(acb->bh); 2089 2090 return &acb->common; 2091 } 2092 2093 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 2094 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2095 BlockCompletionFunc *cb, void *opaque) 2096 { 2097 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 2098 } 2099 2100 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 2101 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 2102 BlockCompletionFunc *cb, void *opaque) 2103 { 2104 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 2105 } 2106 2107 2108 typedef struct BlockAIOCBCoroutine { 2109 BlockAIOCB common; 2110 BlockRequest req; 2111 bool is_write; 2112 bool need_bh; 2113 bool *done; 2114 QEMUBH* bh; 2115 } BlockAIOCBCoroutine; 2116 2117 static const AIOCBInfo bdrv_em_co_aiocb_info = { 2118 .aiocb_size = sizeof(BlockAIOCBCoroutine), 2119 }; 2120 2121 static void bdrv_co_complete(BlockAIOCBCoroutine *acb) 2122 { 2123 if (!acb->need_bh) { 2124 acb->common.cb(acb->common.opaque, acb->req.error); 2125 qemu_aio_unref(acb); 2126 } 2127 } 2128 2129 static void bdrv_co_em_bh(void *opaque) 2130 { 2131 BlockAIOCBCoroutine *acb = opaque; 2132 2133 assert(!acb->need_bh); 2134 qemu_bh_delete(acb->bh); 2135 bdrv_co_complete(acb); 2136 } 2137 2138 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) 2139 { 2140 acb->need_bh = false; 2141 if (acb->req.error != -EINPROGRESS) { 2142 BlockDriverState *bs = acb->common.bs; 2143 2144 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); 2145 qemu_bh_schedule(acb->bh); 2146 } 2147 } 2148 2149 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 2150 static void coroutine_fn 
bdrv_co_do_rw(void *opaque) 2151 { 2152 BlockAIOCBCoroutine *acb = opaque; 2153 BlockDriverState *bs = acb->common.bs; 2154 2155 if (!acb->is_write) { 2156 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 2157 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2158 } else { 2159 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 2160 acb->req.nb_sectors, acb->req.qiov, acb->req.flags); 2161 } 2162 2163 bdrv_co_complete(acb); 2164 } 2165 2166 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 2167 int64_t sector_num, 2168 QEMUIOVector *qiov, 2169 int nb_sectors, 2170 BdrvRequestFlags flags, 2171 BlockCompletionFunc *cb, 2172 void *opaque, 2173 bool is_write) 2174 { 2175 Coroutine *co; 2176 BlockAIOCBCoroutine *acb; 2177 2178 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2179 acb->need_bh = true; 2180 acb->req.error = -EINPROGRESS; 2181 acb->req.sector = sector_num; 2182 acb->req.nb_sectors = nb_sectors; 2183 acb->req.qiov = qiov; 2184 acb->req.flags = flags; 2185 acb->is_write = is_write; 2186 2187 co = qemu_coroutine_create(bdrv_co_do_rw); 2188 qemu_coroutine_enter(co, acb); 2189 2190 bdrv_co_maybe_schedule_bh(acb); 2191 return &acb->common; 2192 } 2193 2194 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 2195 { 2196 BlockAIOCBCoroutine *acb = opaque; 2197 BlockDriverState *bs = acb->common.bs; 2198 2199 acb->req.error = bdrv_co_flush(bs); 2200 bdrv_co_complete(acb); 2201 } 2202 2203 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, 2204 BlockCompletionFunc *cb, void *opaque) 2205 { 2206 trace_bdrv_aio_flush(bs, opaque); 2207 2208 Coroutine *co; 2209 BlockAIOCBCoroutine *acb; 2210 2211 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2212 acb->need_bh = true; 2213 acb->req.error = -EINPROGRESS; 2214 2215 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 2216 qemu_coroutine_enter(co, acb); 2217 2218 bdrv_co_maybe_schedule_bh(acb); 2219 return &acb->common; 2220 } 2221 2222 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 2223 { 2224 BlockAIOCBCoroutine *acb = opaque; 2225 BlockDriverState *bs = acb->common.bs; 2226 2227 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 2228 bdrv_co_complete(acb); 2229 } 2230 2231 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, 2232 int64_t sector_num, int nb_sectors, 2233 BlockCompletionFunc *cb, void *opaque) 2234 { 2235 Coroutine *co; 2236 BlockAIOCBCoroutine *acb; 2237 2238 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 2239 2240 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 2241 acb->need_bh = true; 2242 acb->req.error = -EINPROGRESS; 2243 acb->req.sector = sector_num; 2244 acb->req.nb_sectors = nb_sectors; 2245 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 2246 qemu_coroutine_enter(co, acb); 2247 2248 bdrv_co_maybe_schedule_bh(acb); 2249 return &acb->common; 2250 } 2251 2252 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 2253 BlockCompletionFunc *cb, void *opaque) 2254 { 2255 BlockAIOCB *acb; 2256 2257 acb = g_malloc(aiocb_info->aiocb_size); 2258 acb->aiocb_info = aiocb_info; 2259 acb->bs = bs; 2260 acb->cb = cb; 2261 acb->opaque = opaque; 2262 acb->refcnt = 1; 2263 return acb; 2264 } 2265 2266 void qemu_aio_ref(void *p) 2267 { 2268 BlockAIOCB *acb = p; 2269 acb->refcnt++; 2270 } 2271 2272 void qemu_aio_unref(void *p) 2273 { 2274 BlockAIOCB *acb = p; 2275 assert(acb->refcnt > 0); 2276 if (--acb->refcnt == 0) { 2277 g_free(acb); 2278 } 2279 } 2280 2281 
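/*
 * Illustrative sketch of the AIOCB life cycle implemented above, assuming a
 * driver that allocates its ACBs with qemu_aio_get():
 *
 *     acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);  // refcnt == 1
 *     ...                           // request runs asynchronously
 *     acb->cb(acb->opaque, ret);    // completion callback fires
 *     qemu_aio_unref(acb);          // refcnt drops to 0, acb is freed
 *
 * bdrv_aio_cancel() relies on this: it takes an extra reference, requests
 * cancellation and polls the AioContext until it holds the last reference,
 * i.e. until the completion callback has already run.
 */
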
/**************************************************************/ 2282 /* Coroutine block device emulation */ 2283 2284 typedef struct CoroutineIOCompletion { 2285 Coroutine *coroutine; 2286 int ret; 2287 } CoroutineIOCompletion; 2288 2289 static void bdrv_co_io_em_complete(void *opaque, int ret) 2290 { 2291 CoroutineIOCompletion *co = opaque; 2292 2293 co->ret = ret; 2294 qemu_coroutine_enter(co->coroutine, NULL); 2295 } 2296 2297 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 2298 int nb_sectors, QEMUIOVector *iov, 2299 bool is_write) 2300 { 2301 CoroutineIOCompletion co = { 2302 .coroutine = qemu_coroutine_self(), 2303 }; 2304 BlockAIOCB *acb; 2305 2306 if (is_write) { 2307 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 2308 bdrv_co_io_em_complete, &co); 2309 } else { 2310 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 2311 bdrv_co_io_em_complete, &co); 2312 } 2313 2314 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 2315 if (!acb) { 2316 return -EIO; 2317 } 2318 qemu_coroutine_yield(); 2319 2320 return co.ret; 2321 } 2322 2323 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 2324 int64_t sector_num, int nb_sectors, 2325 QEMUIOVector *iov) 2326 { 2327 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 2328 } 2329 2330 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 2331 int64_t sector_num, int nb_sectors, 2332 QEMUIOVector *iov) 2333 { 2334 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 2335 } 2336 2337 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 2338 { 2339 RwCo *rwco = opaque; 2340 2341 rwco->ret = bdrv_co_flush(rwco->bs); 2342 } 2343 2344 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 2345 { 2346 int ret; 2347 BdrvTrackedRequest req; 2348 2349 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 2350 bdrv_is_sg(bs)) { 2351 return 0; 2352 } 2353 2354 tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); 2355 /* Write back cached data to the OS even with cache=unsafe */ 2356 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 2357 if (bs->drv->bdrv_co_flush_to_os) { 2358 ret = bs->drv->bdrv_co_flush_to_os(bs); 2359 if (ret < 0) { 2360 goto out; 2361 } 2362 } 2363 2364 /* But don't actually force it to the disk with cache=unsafe */ 2365 if (bs->open_flags & BDRV_O_NO_FLUSH) { 2366 goto flush_parent; 2367 } 2368 2369 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2370 if (bs->drv->bdrv_co_flush_to_disk) { 2371 ret = bs->drv->bdrv_co_flush_to_disk(bs); 2372 } else if (bs->drv->bdrv_aio_flush) { 2373 BlockAIOCB *acb; 2374 CoroutineIOCompletion co = { 2375 .coroutine = qemu_coroutine_self(), 2376 }; 2377 2378 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 2379 if (acb == NULL) { 2380 ret = -EIO; 2381 } else { 2382 qemu_coroutine_yield(); 2383 ret = co.ret; 2384 } 2385 } else { 2386 /* 2387 * Some block drivers always operate in either writethrough or unsafe 2388 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 2389 * know how the server works (because the behaviour is hardcoded or 2390 * depends on server-side configuration), so we can't ensure that 2391 * everything is safe on disk. Returning an error doesn't work because 2392 * that would break guests even if the server operates in writethrough 2393 * mode. 2394 * 2395 * Let's hope the user knows what he's doing. 2396 */ 2397 ret = 0; 2398 } 2399 if (ret < 0) { 2400 goto out; 2401 } 2402 2403 /* Now flush the underlying protocol. 
It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    tracked_request_end(&req);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    BdrvTrackedRequest req;
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors,
                          BDRV_TRACKED_DISCARD);
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    ret = 0;
out:
    tracked_request_end(&req);
    return ret;
}

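/*
 * Worked example of the splitting loop above (illustrative numbers only):
 * with bs->bl.discard_alignment = 8, max_discard = 1024, sector_num = 5 and
 * nb_sectors = 2000, the request reaches the driver in three pieces:
 *
 *   1. sectors [5, 8)       - 3 sectors, up to the next alignment boundary
 *   2. sectors [8, 1032)    - 1024 sectors, capped by max_discard
 *   3. sectors [1032, 2005) - the remaining 973 sectors
 *
 * A driver that returns -ENOTSUP for one piece does not abort the loop; the
 * remaining pieces are still attempted.
 */
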
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct {
    CoroutineIOCompletion *co;
    QEMUBH *bh;
} BdrvIoctlCompletionData;

static void bdrv_ioctl_bh_cb(void *opaque)
{
    BdrvIoctlCompletionData *data = opaque;

    bdrv_co_io_em_complete(data->co, -ENOTSUP);
    qemu_bh_delete(data->bh);
    g_free(data);
}

static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest tracked_req;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
    if (!drv || !drv->bdrv_aio_ioctl) {
        co.ret = -ENOTSUP;
        goto out;
    }

    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
    if (!acb) {
        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
                              bdrv_ioctl_bh_cb, data);
        data->co = &co;
        qemu_bh_schedule(data->bh);
    }
    qemu_coroutine_yield();
out:
    tracked_request_end(&tracked_req);
    return co.ret;
}

typedef struct {
    BlockDriverState *bs;
    int req;
    void *buf;
    int ret;
} BdrvIoctlCoData;

static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
{
    BdrvIoctlCoData *data = opaque;
    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
}

/* needed for generic scsi interface */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BdrvIoctlCoData data = {
        .bs = bs,
        .req = req,
        .buf = buf,
        .ret = -EINPROGRESS,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_co_ioctl_entry(&data);
    } else {
        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);

        qemu_coroutine_enter(co, &data);
        while (data.ret == -EINPROGRESS) {
            aio_poll(bdrv_get_aio_context(bs), true);
        }
    }
    return data.ret;
}

static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
                                      acb->req.req, acb->req.buf);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
                           unsigned long int req, void *buf,
                           BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
                                            bs, cb, opaque);
    Coroutine *co;

    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.req = req;
    acb->req.buf = buf;
    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never
returned on success */ 2679 assert(align > 0); 2680 if (size == 0) { 2681 size = align; 2682 } 2683 2684 return qemu_try_memalign(align, size); 2685 } 2686 2687 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 2688 { 2689 void *mem = qemu_try_blockalign(bs, size); 2690 2691 if (mem) { 2692 memset(mem, 0, size); 2693 } 2694 2695 return mem; 2696 } 2697 2698 /* 2699 * Check if all memory in this vector is sector aligned. 2700 */ 2701 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 2702 { 2703 int i; 2704 size_t alignment = bdrv_min_mem_align(bs); 2705 2706 for (i = 0; i < qiov->niov; i++) { 2707 if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 2708 return false; 2709 } 2710 if (qiov->iov[i].iov_len % alignment) { 2711 return false; 2712 } 2713 } 2714 2715 return true; 2716 } 2717 2718 void bdrv_add_before_write_notifier(BlockDriverState *bs, 2719 NotifierWithReturn *notifier) 2720 { 2721 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 2722 } 2723 2724 void bdrv_io_plug(BlockDriverState *bs) 2725 { 2726 BlockDriver *drv = bs->drv; 2727 if (drv && drv->bdrv_io_plug) { 2728 drv->bdrv_io_plug(bs); 2729 } else if (bs->file) { 2730 bdrv_io_plug(bs->file->bs); 2731 } 2732 } 2733 2734 void bdrv_io_unplug(BlockDriverState *bs) 2735 { 2736 BlockDriver *drv = bs->drv; 2737 if (drv && drv->bdrv_io_unplug) { 2738 drv->bdrv_io_unplug(bs); 2739 } else if (bs->file) { 2740 bdrv_io_unplug(bs->file->bs); 2741 } 2742 } 2743 2744 void bdrv_flush_io_queue(BlockDriverState *bs) 2745 { 2746 BlockDriver *drv = bs->drv; 2747 if (drv && drv->bdrv_flush_io_queue) { 2748 drv->bdrv_flush_io_queue(bs); 2749 } else if (bs->file) { 2750 bdrv_flush_io_queue(bs->file->bs); 2751 } 2752 bdrv_start_throttled_reqs(bs); 2753 } 2754 2755 void bdrv_drained_begin(BlockDriverState *bs) 2756 { 2757 if (!bs->quiesce_counter++) { 2758 aio_disable_external(bdrv_get_aio_context(bs)); 2759 } 2760 bdrv_drain(bs); 2761 } 2762 2763 void bdrv_drained_end(BlockDriverState *bs) 2764 { 2765 assert(bs->quiesce_counter > 0); 2766 if (--bs->quiesce_counter > 0) { 2767 return; 2768 } 2769 aio_enable_external(bdrv_get_aio_context(bs)); 2770 } 2771
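
/*
 * Illustrative sketch of how a hypothetical caller would use the quiesce API
 * above: a section that must not race with new external I/O can be bracketed
 * as
 *
 *     bdrv_drained_begin(bs);   // stop external events, drain in-flight I/O
 *     ... reconfigure bs, e.g. switch its backing file or AioContext ...
 *     bdrv_drained_end(bs);     // resume external event handling
 *
 * The calls nest: only the outermost bdrv_drained_end() re-enables external
 * events, because quiesce_counter is incremented and decremented
 * symmetrically.
 */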