/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_group_config(bs, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}
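
/*
 * Usage sketch (illustrative only, not part of the original file): a caller
 * that wants to throttle a BlockDriverState typically joins a throttle group
 * first and then applies a ThrottleConfig.  The bucket indices and field
 * names below follow the throttle API and should be treated as an assumption
 * for this sketch:
 *
 *     ThrottleConfig cfg;
 *
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;  // ~10 MB/s
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 200;               // ~200 IOPS
 *
 *     bdrv_io_limits_enable(bs, "mygroup");   // register with the group
 *     bdrv_set_io_limits(bs, &cfg);           // then apply the limits
 *
 * bdrv_set_io_limits() also kicks the throttled request queues so that
 * requests queued under an old configuration get re-evaluated.
 */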

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;
    bdrv_start_throttled_reqs(bs);
    throttle_group_unregister_bs(bs);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is already part of the group we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/* Check if any requests are in-flight (including throttled requests) */
bool bdrv_requests_pending(BlockDriverState *bs)
{
    BdrvChild *child;

    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }

    QLIST_FOREACH(child, &bs->children, next) {
        if (bdrv_requests_pending(child->bs)) {
            return true;
        }
    }

    return false;
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts.  In that case, use
 * bdrv_drain_all() instead.
 */
void bdrv_drain(BlockDriverState *bs)
{
    bool busy = true;

    while (busy) {
        /* Keep iterating */
        bdrv_flush_io_queue(bs);
        busy = bdrv_requests_pending(bs);
        busy |= aio_poll(bdrv_get_aio_context(bs), busy);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs = NULL;
    GSList *aio_ctxs = NULL, *ctx;

    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_pause(bs->job);
        }
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion.  Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
     */
    while (busy) {
        busy = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;
            bs = NULL;

            aio_context_acquire(aio_context);
            while ((bs = bdrv_next(bs))) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    bdrv_flush_io_queue(bs);
                    if (bdrv_requests_pending(bs)) {
                        busy = true;
                        aio_poll(aio_context, busy);
                    }
                }
            }
            busy |= aio_poll(aio_context, false);
            aio_context_release(aio_context);
        }
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_resume(bs->job);
        }
        aio_context_release(aio_context);
    }
    g_slist_free(aio_ctxs);
}
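
/*
 * Usage sketch (illustrative only): draining is how callers quiesce a device
 * before reconfiguring it.  A hypothetical caller that wants to modify a
 * BlockDriverState safely might do something like:
 *
 *     AioContext *ctx = bdrv_get_aio_context(bs);
 *
 *     aio_context_acquire(ctx);
 *     bdrv_drain(bs);            // wait for requests on this subtree only
 *     // ... safely modify bs here, no requests are in flight ...
 *     aio_context_release(ctx);
 *
 * bdrv_drain_all() is the global variant and acquires each AioContext
 * itself; it is the right choice when in-flight requests may depend on
 * completions in other AioContexts.
 */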

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case).
                 */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}
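
/*
 * Pattern note (illustrative, not part of the original file): bdrv_prwv_co()
 * is one instance of the generic "synchronous wrapper around a coroutine"
 * idiom used throughout this file.  A hypothetical wrapper for some
 * coroutine_fn my_co_op() would follow the same shape; the my_* names below
 * are assumptions for the sketch, not existing APIs:
 *
 *     typedef struct { BlockDriverState *bs; int ret; bool done; } MyCo;
 *
 *     static void coroutine_fn my_co_entry(void *opaque)
 *     {
 *         MyCo *m = opaque;
 *         m->ret = my_co_op(m->bs);     // the actual coroutine_fn work
 *         m->done = true;
 *     }
 *
 *     static int my_op(BlockDriverState *bs)
 *     {
 *         MyCo m = { .bs = bs, .done = false };
 *
 *         if (qemu_in_coroutine()) {
 *             my_co_entry(&m);          // fast path, already in a coroutine
 *         } else {
 *             Coroutine *co = qemu_coroutine_create(my_co_entry);
 *             qemu_coroutine_enter(co, &m);
 *             while (!m.done) {         // drive completions until finished
 *                 aio_poll(bdrv_get_aio_context(bs), true);
 *             }
 *         }
 *         return m.ret;
 *     }
 *
 * bdrv_get_block_status_above() later in this file uses the same structure
 * with a "done" flag instead of the NOT_DONE sentinel.
 */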

/* return < 0 if error.  See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}

/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
                          uint8_t *buf, int nb_sectors)
{
    bool enabled;
    int ret;

    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bs->io_limits_enabled = enabled;
    return ret;
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
        if (nb_sectors <= 0) {
            return 0;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}

int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
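
/*
 * Worked example (illustrative only) of the cluster rounding used above,
 * assuming a 64 KiB cluster size, i.e. c = 65536 / BDRV_SECTOR_SIZE = 128
 * sectors:
 *
 *     guest request:     sector_num = 300, nb_sectors = 20     -> [300, 320)
 *     cluster_sector_num = QEMU_ALIGN_DOWN(300, 128)           =  256
 *     cluster_nb_sectors = QEMU_ALIGN_UP(300 - 256 + 20, 128)  =  128
 *
 * so the copy-on-read bounce buffer covers sectors [256, 384), the whole
 * cluster, and skip_bytes = (300 - 256) * BDRV_SECTOR_SIZE selects the part
 * of the bounce buffer that is copied back into the guest's qiov.
 */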

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!bs->zero_beyond_eof) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF */
        int64_t total_sectors, max_nb_sectors;

        total_sectors = bdrv_nb_sectors(bs);
        if (total_sectors < 0) {
            ret = total_sectors;
            goto out;
        }

        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (nb_sectors < max_nb_sectors) {
            ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
        } else if (max_nb_sectors > 0) {
            QEMUIOVector local_qiov;

            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, 0,
                              max_nb_sectors * BDRV_SECTOR_SIZE);

            ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
                                     &local_qiov);

            qemu_iovec_destroy(&local_qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* Don't do copy-on-read if we read data before write operation */
    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_COPY_ON_READ)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        throttle_group_co_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_NO_COPY_ON_READ);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}
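
/*
 * Worked example (illustrative only) of the read alignment done in
 * bdrv_co_do_preadv() above, assuming bs->request_alignment = 4096:
 *
 *     guest request:  offset = 5000, bytes = 3000           -> [5000, 8000)
 *     head padding:   offset & (align - 1) = 904 bytes read into head_buf
 *                     new offset = 4096, bytes = 3904
 *     tail padding:   (offset + bytes) & (align - 1) = 3904, so another
 *                     4096 - 3904 = 192 bytes go into tail_buf
 *     final request:  [4096, 8192), bytes = ROUND_UP(3904, 4096) = 4096
 *
 * The driver only ever sees the aligned request; the guest buffers are
 * spliced into local_qiov between the two padding buffers.
 */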

#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
                                        BDRV_REQUEST_MAX_SECTORS);

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}
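
/*
 * Worked example (illustrative only) of the alignment loop above, assuming
 * bs->bl.write_zeroes_alignment = 128 sectors and a request of
 * sector_num = 100, nb_sectors = 1000:
 *
 *     1st iteration: 100 % 128 != 0, so num = 128 - 100 % 128 = 28
 *                    -> zeroes [100, 128), 972 sectors left
 *     2nd iteration: 128 % 128 == 0, (128 + 972) % 128 = 76,
 *                    so num = 972 - 76 = 896
 *                    -> zeroes [128, 1024), 76 sectors left
 *     3rd iteration: 76 < alignment, no trimming, num = 76
 *                    -> zeroes [1024, 1100), done
 *
 * Each chunk is still clamped to max_write_zeroes before being issued.
 */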

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_offset < offset + bytes) {
        bs->wr_highest_offset = offset + bytes;
    }

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = align - ((offset + bytes) & (align - 1));


    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base   = buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
                                   &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(bs, req, offset, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        throttle_group_co_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (!qiov) {
        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      BdrvRequestFlags flags)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);

    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE | flags);
}

int bdrv_flush_all(void)
{
    BlockDriverState *bs = NULL;
    int result = 0;

    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}

typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int64_t ret;
    bool done;
} BdrvCoGetBlockStatusData;
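
/*
 * Usage sketch (illustrative only) for the block-status query implemented
 * below: a hypothetical caller mapping which parts of an image contain data
 * could iterate like this (sketch only, error handling trimmed):
 *
 *     int64_t sector = 0, total = bdrv_nb_sectors(bs);
 *     int n;
 *
 *     while (sector < total) {
 *         int64_t ret = bdrv_get_block_status(bs, sector, total - sector, &n);
 *
 *         if (ret & BDRV_BLOCK_ZERO) {
 *             // reads as zeroes, nothing to copy
 *         } else if (ret & BDRV_BLOCK_DATA) {
 *             // data present in this layer; if BDRV_BLOCK_OFFSET_VALID is
 *             // set, the masked return value is the host file offset
 *         }
 *         sector += n;   // n sectors share the same state
 *     }
 */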

/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int file_pnum;

        ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
                                        *pnum, &file_pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (!file_pnum) {
                /* !file_pnum indicates an offset at or beyond the EOF; it is
                 * perfectly valid for the format block driver to point to such
                 * offsets, so catch it and mark everything as zero */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

    return ret;
}

static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
        BlockDriverState *base,
        int64_t sector_num,
        int nb_sectors,
        int *pnum)
{
    BlockDriverState *p;
    int64_t ret = 0;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
            break;
        }
        /* [sector_num, pnum] unallocated on this layer, which could be only
         * the first part of [sector_num, nb_sectors].  */
        nb_sectors = MIN(nb_sectors, *pnum);
    }
    return ret;
}

/* Coroutine wrapper for bdrv_get_block_status_above() */
static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;

    data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
                                               data->sector_num,
                                               data->nb_sectors,
                                               data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_get_block_status_above().
 *
 * See bdrv_co_get_block_status_above() for details.
 */
int64_t bdrv_get_block_status_above(BlockDriverState *bs,
                                    BlockDriverState *base,
                                    int64_t sector_num,
                                    int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .base = base,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_above_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}

int64_t bdrv_get_block_status(BlockDriverState *bs,
                              int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    return bdrv_get_block_status_above(bs, backing_bs(bs),
                                       sector_num, nb_sectors, pnum);
}

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}
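
/*
 * Usage sketch (illustrative only): bdrv_is_allocated_above() is the query a
 * hypothetical commit/stream-style copy loop would use to decide which
 * sectors of an overlay chain need copying towards 'base' (sketch only,
 * buffers and error handling omitted):
 *
 *     int64_t sector = 0, end = bdrv_nb_sectors(top);
 *     int n;
 *
 *     while (sector < end) {
 *         if (bdrv_is_allocated_above(top, base, sector, end - sector, &n)) {
 *             // sectors [sector, sector + n) live above 'base':
 *             // read them from 'top' and write them into 'base'
 *         }
 *         sector += n;
 *     }
 */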

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    }

    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
    }

    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
    return -ENOTSUP;
}

/**************************************************************/
/* async I/Os */

BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}


typedef struct MultiwriteCB {
    int error;
    int num_requests;
    int num_callbacks;
    struct {
        BlockCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;

static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    if (bs->blk) {
        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
                              num_reqs - outidx - 1);
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted.  In the error case this function returns -1, and any of the
 * requests may or may not be submitted yet.  In particular, this means that
 * the callback will be called for some of the requests, for others it won't.
 * The caller must check the error field of the BlockRequest to wait for the
 * right callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests.  However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}
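
/*
 * Usage sketch (illustrative only) for the AIO interface implemented above.
 * A hypothetical caller submits a request with a completion callback and may
 * later cancel it synchronously; my_read_done() and my_cookie are assumptions
 * for the example:
 *
 *     static void my_read_done(void *opaque, int ret)
 *     {
 *         // ret < 0 on error, 0 on success; opaque is my_cookie below
 *     }
 *
 *     BlockAIOCB *acb;
 *
 *     acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
 *                          my_read_done, my_cookie);
 *     ...
 *     bdrv_aio_cancel(acb);   // polls until the request has completed or
 *                             // been cancelled; the callback still runs
 */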
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}
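
/*
 * Illustrative sketch (not compiled): the guarantee that need_bh and
 * bdrv_co_em_bh provide to callers. Even when the request coroutine finishes
 * before the submission function returns, completion is deferred to a bottom
 * half, so the callback can never observe state the caller has not set up
 * yet. The ExampleTracker type and its callback are made up for the example.
 */
#if 0
typedef struct ExampleTracker {
    BlockAIOCB *acb;
    bool done;
} ExampleTracker;

static void example_flush_cb(void *opaque, int ret)
{
    ExampleTracker *t = opaque;

    /* Safe: the BH-deferred completion means bdrv_aio_flush() has already
     * returned and t->acb is valid by the time we run. */
    t->acb = NULL;
    t->done = true;
}

static void example_start_flush(BlockDriverState *bs, ExampleTracker *t)
{
    t->done = false;
    t->acb = bdrv_aio_flush(bs, example_flush_cb, t);
    /* t->acb is usable here; the callback has not run yet. */
}
#endif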
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_malloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_free(acb);
    }
}
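
/*
 * Illustrative sketch (not compiled): the reference-counting contract behind
 * qemu_aio_get()/qemu_aio_ref()/qemu_aio_unref(). An AIOCB implementation
 * embeds BlockAIOCB as its first member, declares its size in an AIOCBInfo,
 * and drops the initial reference once the completion callback has run. The
 * ExampleAIOCB type and example_aiocb_complete() are made up for the example.
 */
#if 0
typedef struct ExampleAIOCB {
    BlockAIOCB common;          /* must be the first member */
    int ret;
} ExampleAIOCB;

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(ExampleAIOCB),
};

static void example_aiocb_complete(ExampleAIOCB *acb)
{
    /* Invoke the user callback, then drop the reference taken in
     * qemu_aio_get(). Anyone who called qemu_aio_ref() (e.g.
     * bdrv_aio_cancel()) keeps the AIOCB alive until they unref it. */
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_unref(acb);
}

static BlockAIOCB *example_aio_start(BlockDriverState *bs,
                                     BlockCompletionFunc *cb, void *opaque)
{
    ExampleAIOCB *acb = qemu_aio_get(&example_aiocb_info, bs, cb, opaque);

    acb->ret = 0;
    /* ... kick off the actual work and arrange for
     * example_aiocb_complete(acb) to run when it finishes ... */
    return &acb->common;
}
#endif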
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bs->file ? bdrv_co_flush(bs->file->bs) : 0;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass num, not nb_sectors, so the AIO path honours the same
             * alignment and size limits as the coroutine path above. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
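
/*
 * Illustrative sketch (not compiled): the pattern shared by bdrv_flush() and
 * bdrv_discard() above for running a coroutine_fn from synchronous code. The
 * ExampleCo type, example_co_entry() and example_do_something() are made up;
 * only the qemu_in_coroutine()/aio_poll() structure mirrors the real code.
 */
#if 0
typedef struct ExampleCo {
    BlockDriverState *bs;
    int ret;
} ExampleCo;

static void coroutine_fn example_co_entry(void *opaque)
{
    ExampleCo *eco = opaque;

    eco->ret = example_do_something(eco->bs);   /* some coroutine_fn helper */
}

static int example_sync_wrapper(BlockDriverState *bs)
{
    ExampleCo eco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        example_co_entry(&eco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        Coroutine *co = qemu_coroutine_create(example_co_entry);

        qemu_coroutine_enter(co, &eco);
        while (eco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return eco.ret;
}
#endif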
/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file->bs);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file->bs);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file->bs);
    }
    bdrv_start_throttled_reqs(bs);
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    if (!bs->quiesce_counter++) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }
    bdrv_drain(bs);
}

void bdrv_drained_end(BlockDriverState *bs)
{
    assert(bs->quiesce_counter > 0);
    if (--bs->quiesce_counter > 0) {
        return;
    }
    aio_enable_external(bdrv_get_aio_context(bs));
}
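
/*
 * Illustrative sketch (not compiled): bracketing an operation that must not
 * race with new external I/O between bdrv_drained_begin() and
 * bdrv_drained_end(). example_modify_graph() is a made-up placeholder for
 * whatever needs the device quiesced.
 */
#if 0
static void example_quiesced_update(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);   /* stop external requests, drain in-flight I/O */

    example_modify_graph(bs); /* safe: no new external requests arrive here */

    bdrv_drained_end(bs);     /* resume accepting external requests */
}
#endif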