1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "trace.h" 26 #include "block/blockjob.h" 27 #include "block/block_int.h" 28 #include "block/throttle-groups.h" 29 #include "qemu/error-report.h" 30 31 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 32 33 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 34 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 35 BlockCompletionFunc *cb, void *opaque); 36 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 37 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 38 BlockCompletionFunc *cb, void *opaque); 39 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 40 int64_t sector_num, int nb_sectors, 41 QEMUIOVector *iov); 42 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 43 int64_t sector_num, int nb_sectors, 44 QEMUIOVector *iov); 45 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 46 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 47 BdrvRequestFlags flags); 48 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 49 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 50 BdrvRequestFlags flags); 51 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 52 int64_t sector_num, 53 QEMUIOVector *qiov, 54 int nb_sectors, 55 BdrvRequestFlags flags, 56 BlockCompletionFunc *cb, 57 void *opaque, 58 bool is_write); 59 static void coroutine_fn bdrv_co_do_rw(void *opaque); 60 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 61 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); 62 63 /* throttling disk I/O limits */ 64 void bdrv_set_io_limits(BlockDriverState *bs, 65 ThrottleConfig *cfg) 66 { 67 int i; 68 69 throttle_group_config(bs, cfg); 70 71 for (i = 0; i < 2; i++) { 72 qemu_co_enter_next(&bs->throttled_reqs[i]); 73 } 74 } 75 76 /* this function drain all the throttled IOs */ 77 static bool bdrv_start_throttled_reqs(BlockDriverState *bs) 78 { 79 bool drained = false; 80 bool enabled = bs->io_limits_enabled; 81 int i; 82 83 bs->io_limits_enabled = false; 84 85 for (i = 0; i < 2; i++) { 86 while (qemu_co_enter_next(&bs->throttled_reqs[i])) { 87 drained = true; 88 } 89 } 90 91 bs->io_limits_enabled = enabled; 92 93 return drained; 94 } 95 96 void bdrv_io_limits_disable(BlockDriverState *bs) 97 { 98 bs->io_limits_enabled = false; 99 bdrv_start_throttled_reqs(bs); 
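    /* The limits flag was cleared above, so this restarts anything still
     * queued in the throttled request lists before the BDS is unregistered
     * from its throttle group below. */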
100 throttle_group_unregister_bs(bs);
101 }
102
103 /* should be called before bdrv_set_io_limits if a limit is set */
104 void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
105 {
106 assert(!bs->io_limits_enabled);
107 throttle_group_register_bs(bs, group);
108 bs->io_limits_enabled = true;
109 }
110
111 void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
112 {
113 /* this bs is not part of any group */
114 if (!bs->throttle_state) {
115 return;
116 }
117
118 /* this bs is already part of the group we want */
119 if (!g_strcmp0(throttle_group_get_name(bs), group)) {
120 return;
121 }
122
123 /* need to change the group this bs belongs to */
124 bdrv_io_limits_disable(bs);
125 bdrv_io_limits_enable(bs, group);
126 }
127
128 void bdrv_setup_io_funcs(BlockDriver *bdrv)
129 {
130 /* Block drivers without coroutine functions need emulation */
131 if (!bdrv->bdrv_co_readv) {
132 bdrv->bdrv_co_readv = bdrv_co_readv_em;
133 bdrv->bdrv_co_writev = bdrv_co_writev_em;
134
135 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
136 * the block driver lacks aio we need to emulate that too.
137 */
138 if (!bdrv->bdrv_aio_readv) {
139 /* add AIO emulation layer */
140 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
141 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
142 }
143 }
144 }
145
146 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
147 {
148 BlockDriver *drv = bs->drv;
149 Error *local_err = NULL;
150
151 memset(&bs->bl, 0, sizeof(bs->bl));
152
153 if (!drv) {
154 return;
155 }
156
157 /* Take some limits from the children as a default */
158 if (bs->file) {
159 bdrv_refresh_limits(bs->file, &local_err);
160 if (local_err) {
161 error_propagate(errp, local_err);
162 return;
163 }
164 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
165 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
166 bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment;
167 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
168 } else {
169 bs->bl.min_mem_alignment = 512;
170 bs->bl.opt_mem_alignment = getpagesize();
171 }
172
173 if (bs->backing_hd) {
174 bdrv_refresh_limits(bs->backing_hd, &local_err);
175 if (local_err) {
176 error_propagate(errp, local_err);
177 return;
178 }
179 bs->bl.opt_transfer_length =
180 MAX(bs->bl.opt_transfer_length,
181 bs->backing_hd->bl.opt_transfer_length);
182 bs->bl.max_transfer_length =
183 MIN_NON_ZERO(bs->bl.max_transfer_length,
184 bs->backing_hd->bl.max_transfer_length);
185 bs->bl.opt_mem_alignment =
186 MAX(bs->bl.opt_mem_alignment,
187 bs->backing_hd->bl.opt_mem_alignment);
188 bs->bl.min_mem_alignment =
189 MAX(bs->bl.min_mem_alignment,
190 bs->backing_hd->bl.min_mem_alignment);
191 }
192
193 /* Then let the driver override it */
194 if (drv->bdrv_refresh_limits) {
195 drv->bdrv_refresh_limits(bs, errp);
196 }
197 }
198
199 /**
200 * The copy-on-read flag is actually a reference count so multiple users may
201 * use the feature without worrying about clobbering its previous state.
202 * Copy-on-read stays enabled until all users have disabled it.
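 *
 * A minimal usage sketch (illustrative only):
 *
 *     bdrv_enable_copy_on_read(bs);     // count 0 -> 1, CoR active
 *     bdrv_enable_copy_on_read(bs);     // count 1 -> 2, still active
 *     bdrv_disable_copy_on_read(bs);    // count 2 -> 1, still active
 *     bdrv_disable_copy_on_read(bs);    // count 1 -> 0, CoR off again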
203 */ 204 void bdrv_enable_copy_on_read(BlockDriverState *bs) 205 { 206 bs->copy_on_read++; 207 } 208 209 void bdrv_disable_copy_on_read(BlockDriverState *bs) 210 { 211 assert(bs->copy_on_read > 0); 212 bs->copy_on_read--; 213 } 214 215 /* Check if any requests are in-flight (including throttled requests) */ 216 static bool bdrv_requests_pending(BlockDriverState *bs) 217 { 218 if (!QLIST_EMPTY(&bs->tracked_requests)) { 219 return true; 220 } 221 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 222 return true; 223 } 224 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 225 return true; 226 } 227 if (bs->file && bdrv_requests_pending(bs->file)) { 228 return true; 229 } 230 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { 231 return true; 232 } 233 return false; 234 } 235 236 /* 237 * Wait for pending requests to complete on a single BlockDriverState subtree 238 * 239 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 240 * AioContext. 241 * 242 * Only this BlockDriverState's AioContext is run, so in-flight requests must 243 * not depend on events in other AioContexts. In that case, use 244 * bdrv_drain_all() instead. 245 */ 246 void bdrv_drain(BlockDriverState *bs) 247 { 248 bool busy = true; 249 250 while (busy) { 251 /* Keep iterating */ 252 bdrv_flush_io_queue(bs); 253 busy = bdrv_requests_pending(bs); 254 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 255 } 256 } 257 258 /* 259 * Wait for pending requests to complete across all BlockDriverStates 260 * 261 * This function does not flush data to disk, use bdrv_flush_all() for that 262 * after calling this function. 263 */ 264 void bdrv_drain_all(void) 265 { 266 /* Always run first iteration so any pending completion BHs run */ 267 bool busy = true; 268 BlockDriverState *bs = NULL; 269 GSList *aio_ctxs = NULL, *ctx; 270 271 while ((bs = bdrv_next(bs))) { 272 AioContext *aio_context = bdrv_get_aio_context(bs); 273 274 aio_context_acquire(aio_context); 275 if (bs->job) { 276 block_job_pause(bs->job); 277 } 278 aio_context_release(aio_context); 279 280 if (!g_slist_find(aio_ctxs, aio_context)) { 281 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 282 } 283 } 284 285 /* Note that completion of an asynchronous I/O operation can trigger any 286 * number of other I/O operations on other devices---for example a 287 * coroutine can submit an I/O request to another device in response to 288 * request completion. Therefore we must keep looping until there was no 289 * more activity rather than simply draining each device independently. 290 */ 291 while (busy) { 292 busy = false; 293 294 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 295 AioContext *aio_context = ctx->data; 296 bs = NULL; 297 298 aio_context_acquire(aio_context); 299 while ((bs = bdrv_next(bs))) { 300 if (aio_context == bdrv_get_aio_context(bs)) { 301 bdrv_flush_io_queue(bs); 302 if (bdrv_requests_pending(bs)) { 303 busy = true; 304 aio_poll(aio_context, busy); 305 } 306 } 307 } 308 busy |= aio_poll(aio_context, false); 309 aio_context_release(aio_context); 310 } 311 } 312 313 bs = NULL; 314 while ((bs = bdrv_next(bs))) { 315 AioContext *aio_context = bdrv_get_aio_context(bs); 316 317 aio_context_acquire(aio_context); 318 if (bs->job) { 319 block_job_resume(bs->job); 320 } 321 aio_context_release(aio_context); 322 } 323 g_slist_free(aio_ctxs); 324 } 325 326 /** 327 * Remove an active request from the tracked requests list 328 * 329 * This function should be called when a tracked request is completing. 
330 */ 331 static void tracked_request_end(BdrvTrackedRequest *req) 332 { 333 if (req->serialising) { 334 req->bs->serialising_in_flight--; 335 } 336 337 QLIST_REMOVE(req, list); 338 qemu_co_queue_restart_all(&req->wait_queue); 339 } 340 341 /** 342 * Add an active request to the tracked requests list 343 */ 344 static void tracked_request_begin(BdrvTrackedRequest *req, 345 BlockDriverState *bs, 346 int64_t offset, 347 unsigned int bytes, bool is_write) 348 { 349 *req = (BdrvTrackedRequest){ 350 .bs = bs, 351 .offset = offset, 352 .bytes = bytes, 353 .is_write = is_write, 354 .co = qemu_coroutine_self(), 355 .serialising = false, 356 .overlap_offset = offset, 357 .overlap_bytes = bytes, 358 }; 359 360 qemu_co_queue_init(&req->wait_queue); 361 362 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 363 } 364 365 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 366 { 367 int64_t overlap_offset = req->offset & ~(align - 1); 368 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 369 - overlap_offset; 370 371 if (!req->serialising) { 372 req->bs->serialising_in_flight++; 373 req->serialising = true; 374 } 375 376 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 377 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 378 } 379 380 /** 381 * Round a region to cluster boundaries 382 */ 383 void bdrv_round_to_clusters(BlockDriverState *bs, 384 int64_t sector_num, int nb_sectors, 385 int64_t *cluster_sector_num, 386 int *cluster_nb_sectors) 387 { 388 BlockDriverInfo bdi; 389 390 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 391 *cluster_sector_num = sector_num; 392 *cluster_nb_sectors = nb_sectors; 393 } else { 394 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 395 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 396 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 397 nb_sectors, c); 398 } 399 } 400 401 static int bdrv_get_cluster_size(BlockDriverState *bs) 402 { 403 BlockDriverInfo bdi; 404 int ret; 405 406 ret = bdrv_get_info(bs, &bdi); 407 if (ret < 0 || bdi.cluster_size == 0) { 408 return bs->request_alignment; 409 } else { 410 return bdi.cluster_size; 411 } 412 } 413 414 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 415 int64_t offset, unsigned int bytes) 416 { 417 /* aaaa bbbb */ 418 if (offset >= req->overlap_offset + req->overlap_bytes) { 419 return false; 420 } 421 /* bbbb aaaa */ 422 if (req->overlap_offset >= offset + bytes) { 423 return false; 424 } 425 return true; 426 } 427 428 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 429 { 430 BlockDriverState *bs = self->bs; 431 BdrvTrackedRequest *req; 432 bool retry; 433 bool waited = false; 434 435 if (!bs->serialising_in_flight) { 436 return false; 437 } 438 439 do { 440 retry = false; 441 QLIST_FOREACH(req, &bs->tracked_requests, list) { 442 if (req == self || (!req->serialising && !self->serialising)) { 443 continue; 444 } 445 if (tracked_request_overlaps(req, self->overlap_offset, 446 self->overlap_bytes)) 447 { 448 /* Hitting this means there was a reentrant request, for 449 * example, a block driver issuing nested requests. This must 450 * never happen since it means deadlock. 451 */ 452 assert(qemu_coroutine_self() != req->co); 453 454 /* If the request is already (indirectly) waiting for us, or 455 * will wait for us as soon as it wakes up, then just go on 456 * (instead of producing a deadlock in the former case). 
*/ 457 if (!req->waiting_for) { 458 self->waiting_for = req; 459 qemu_co_queue_wait(&req->wait_queue); 460 self->waiting_for = NULL; 461 retry = true; 462 waited = true; 463 break; 464 } 465 } 466 } 467 } while (retry); 468 469 return waited; 470 } 471 472 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 473 size_t size) 474 { 475 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 476 return -EIO; 477 } 478 479 if (!bdrv_is_inserted(bs)) { 480 return -ENOMEDIUM; 481 } 482 483 if (offset < 0) { 484 return -EIO; 485 } 486 487 return 0; 488 } 489 490 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 491 int nb_sectors) 492 { 493 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 494 return -EIO; 495 } 496 497 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 498 nb_sectors * BDRV_SECTOR_SIZE); 499 } 500 501 typedef struct RwCo { 502 BlockDriverState *bs; 503 int64_t offset; 504 QEMUIOVector *qiov; 505 bool is_write; 506 int ret; 507 BdrvRequestFlags flags; 508 } RwCo; 509 510 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 511 { 512 RwCo *rwco = opaque; 513 514 if (!rwco->is_write) { 515 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 516 rwco->qiov->size, rwco->qiov, 517 rwco->flags); 518 } else { 519 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 520 rwco->qiov->size, rwco->qiov, 521 rwco->flags); 522 } 523 } 524 525 /* 526 * Process a vectored synchronous request using coroutines 527 */ 528 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 529 QEMUIOVector *qiov, bool is_write, 530 BdrvRequestFlags flags) 531 { 532 Coroutine *co; 533 RwCo rwco = { 534 .bs = bs, 535 .offset = offset, 536 .qiov = qiov, 537 .is_write = is_write, 538 .ret = NOT_DONE, 539 .flags = flags, 540 }; 541 542 /** 543 * In sync call context, when the vcpu is blocked, this throttling timer 544 * will not fire; so the I/O throttling function has to be disabled here 545 * if it has been enabled. 546 */ 547 if (bs->io_limits_enabled) { 548 fprintf(stderr, "Disabling I/O throttling on '%s' due " 549 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 550 bdrv_io_limits_disable(bs); 551 } 552 553 if (qemu_in_coroutine()) { 554 /* Fast-path if already in coroutine context */ 555 bdrv_rw_co_entry(&rwco); 556 } else { 557 AioContext *aio_context = bdrv_get_aio_context(bs); 558 559 co = qemu_coroutine_create(bdrv_rw_co_entry); 560 qemu_coroutine_enter(co, &rwco); 561 while (rwco.ret == NOT_DONE) { 562 aio_poll(aio_context, true); 563 } 564 } 565 return rwco.ret; 566 } 567 568 /* 569 * Process a synchronous request using coroutines 570 */ 571 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 572 int nb_sectors, bool is_write, BdrvRequestFlags flags) 573 { 574 QEMUIOVector qiov; 575 struct iovec iov = { 576 .iov_base = (void *)buf, 577 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 578 }; 579 580 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 581 return -EINVAL; 582 } 583 584 qemu_iovec_init_external(&qiov, &iov, 1); 585 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 586 &qiov, is_write, flags); 587 } 588 589 /* return < 0 if error. 
See bdrv_write() for the return codes */ 590 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 591 uint8_t *buf, int nb_sectors) 592 { 593 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 594 } 595 596 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 597 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 598 uint8_t *buf, int nb_sectors) 599 { 600 bool enabled; 601 int ret; 602 603 enabled = bs->io_limits_enabled; 604 bs->io_limits_enabled = false; 605 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 606 bs->io_limits_enabled = enabled; 607 return ret; 608 } 609 610 /* Return < 0 if error. Important errors are: 611 -EIO generic I/O error (may happen for all errors) 612 -ENOMEDIUM No media inserted. 613 -EINVAL Invalid sector number or nb_sectors 614 -EACCES Trying to write a read-only device 615 */ 616 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 617 const uint8_t *buf, int nb_sectors) 618 { 619 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 620 } 621 622 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 623 int nb_sectors, BdrvRequestFlags flags) 624 { 625 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 626 BDRV_REQ_ZERO_WRITE | flags); 627 } 628 629 /* 630 * Completely zero out a block device with the help of bdrv_write_zeroes. 631 * The operation is sped up by checking the block status and only writing 632 * zeroes to the device if they currently do not return zeroes. Optional 633 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 634 * 635 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 636 */ 637 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 638 { 639 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 640 int n; 641 642 target_sectors = bdrv_nb_sectors(bs); 643 if (target_sectors < 0) { 644 return target_sectors; 645 } 646 647 for (;;) { 648 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 649 if (nb_sectors <= 0) { 650 return 0; 651 } 652 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 653 if (ret < 0) { 654 error_report("error getting block status at sector %" PRId64 ": %s", 655 sector_num, strerror(-ret)); 656 return ret; 657 } 658 if (ret & BDRV_BLOCK_ZERO) { 659 sector_num += n; 660 continue; 661 } 662 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 663 if (ret < 0) { 664 error_report("error writing zeroes at sector %" PRId64 ": %s", 665 sector_num, strerror(-ret)); 666 return ret; 667 } 668 sector_num += n; 669 } 670 } 671 672 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 673 { 674 QEMUIOVector qiov; 675 struct iovec iov = { 676 .iov_base = (void *)buf, 677 .iov_len = bytes, 678 }; 679 int ret; 680 681 if (bytes < 0) { 682 return -EINVAL; 683 } 684 685 qemu_iovec_init_external(&qiov, &iov, 1); 686 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 687 if (ret < 0) { 688 return ret; 689 } 690 691 return bytes; 692 } 693 694 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 695 { 696 int ret; 697 698 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 699 if (ret < 0) { 700 return ret; 701 } 702 703 return qiov->size; 704 } 705 706 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 707 const void *buf, int bytes) 708 { 709 QEMUIOVector qiov; 710 struct iovec iov = { 711 .iov_base = (void *) buf, 712 .iov_len = bytes, 713 }; 714 715 if (bytes < 0) { 716 return -EINVAL; 717 } 718 719 
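    /* Wrap the caller's flat buffer in a single-element QEMUIOVector and
     * reuse the vectored path below. */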
qemu_iovec_init_external(&qiov, &iov, 1); 720 return bdrv_pwritev(bs, offset, &qiov); 721 } 722 723 /* 724 * Writes to the file and ensures that no writes are reordered across this 725 * request (acts as a barrier) 726 * 727 * Returns 0 on success, -errno in error cases. 728 */ 729 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 730 const void *buf, int count) 731 { 732 int ret; 733 734 ret = bdrv_pwrite(bs, offset, buf, count); 735 if (ret < 0) { 736 return ret; 737 } 738 739 /* No flush needed for cache modes that already do it */ 740 if (bs->enable_write_cache) { 741 bdrv_flush(bs); 742 } 743 744 return 0; 745 } 746 747 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 748 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 749 { 750 /* Perform I/O through a temporary buffer so that users who scribble over 751 * their read buffer while the operation is in progress do not end up 752 * modifying the image file. This is critical for zero-copy guest I/O 753 * where anything might happen inside guest memory. 754 */ 755 void *bounce_buffer; 756 757 BlockDriver *drv = bs->drv; 758 struct iovec iov; 759 QEMUIOVector bounce_qiov; 760 int64_t cluster_sector_num; 761 int cluster_nb_sectors; 762 size_t skip_bytes; 763 int ret; 764 765 /* Cover entire cluster so no additional backing file I/O is required when 766 * allocating cluster in the image file. 767 */ 768 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 769 &cluster_sector_num, &cluster_nb_sectors); 770 771 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 772 cluster_sector_num, cluster_nb_sectors); 773 774 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 775 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 776 if (bounce_buffer == NULL) { 777 ret = -ENOMEM; 778 goto err; 779 } 780 781 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 782 783 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 784 &bounce_qiov); 785 if (ret < 0) { 786 goto err; 787 } 788 789 if (drv->bdrv_co_write_zeroes && 790 buffer_is_zero(bounce_buffer, iov.iov_len)) { 791 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 792 cluster_nb_sectors, 0); 793 } else { 794 /* This does not change the data on the disk, it is not necessary 795 * to flush even in cache=writethrough mode. 796 */ 797 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 798 &bounce_qiov); 799 } 800 801 if (ret < 0) { 802 /* It might be okay to ignore write errors for guest requests. If this 803 * is a deliberate copy-on-read then we don't want to ignore the error. 804 * Simply report it in all cases. 805 */ 806 goto err; 807 } 808 809 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 810 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 811 nb_sectors * BDRV_SECTOR_SIZE); 812 813 err: 814 qemu_vfree(bounce_buffer); 815 return ret; 816 } 817 818 /* 819 * Forwards an already correctly aligned request to the BlockDriver. This 820 * handles copy on read and zeroing after EOF; any other features must be 821 * implemented by the caller. 
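 *
 * "Correctly aligned" means that offset and bytes are multiples of
 * BDRV_SECTOR_SIZE (see the asserts below); the padding needed to achieve
 * this is done by bdrv_co_do_preadv().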
822 */ 823 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 824 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 825 int64_t align, QEMUIOVector *qiov, int flags) 826 { 827 BlockDriver *drv = bs->drv; 828 int ret; 829 830 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 831 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 832 833 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 834 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 835 assert(!qiov || bytes == qiov->size); 836 837 /* Handle Copy on Read and associated serialisation */ 838 if (flags & BDRV_REQ_COPY_ON_READ) { 839 /* If we touch the same cluster it counts as an overlap. This 840 * guarantees that allocating writes will be serialized and not race 841 * with each other for the same cluster. For example, in copy-on-read 842 * it ensures that the CoR read and write operations are atomic and 843 * guest writes cannot interleave between them. */ 844 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 845 } 846 847 wait_serialising_requests(req); 848 849 if (flags & BDRV_REQ_COPY_ON_READ) { 850 int pnum; 851 852 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 853 if (ret < 0) { 854 goto out; 855 } 856 857 if (!ret || pnum != nb_sectors) { 858 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 859 goto out; 860 } 861 } 862 863 /* Forward the request to the BlockDriver */ 864 if (!bs->zero_beyond_eof) { 865 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 866 } else { 867 /* Read zeros after EOF */ 868 int64_t total_sectors, max_nb_sectors; 869 870 total_sectors = bdrv_nb_sectors(bs); 871 if (total_sectors < 0) { 872 ret = total_sectors; 873 goto out; 874 } 875 876 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 877 align >> BDRV_SECTOR_BITS); 878 if (nb_sectors < max_nb_sectors) { 879 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 880 } else if (max_nb_sectors > 0) { 881 QEMUIOVector local_qiov; 882 883 qemu_iovec_init(&local_qiov, qiov->niov); 884 qemu_iovec_concat(&local_qiov, qiov, 0, 885 max_nb_sectors * BDRV_SECTOR_SIZE); 886 887 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 888 &local_qiov); 889 890 qemu_iovec_destroy(&local_qiov); 891 } else { 892 ret = 0; 893 } 894 895 /* Reading beyond end of file is supposed to produce zeroes */ 896 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 897 uint64_t offset = MAX(0, total_sectors - sector_num); 898 uint64_t bytes = (sector_num + nb_sectors - offset) * 899 BDRV_SECTOR_SIZE; 900 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 901 } 902 } 903 904 out: 905 return ret; 906 } 907 908 /* 909 * Handle a read request in coroutine context 910 */ 911 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 912 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 913 BdrvRequestFlags flags) 914 { 915 BlockDriver *drv = bs->drv; 916 BdrvTrackedRequest req; 917 918 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 919 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 920 uint8_t *head_buf = NULL; 921 uint8_t *tail_buf = NULL; 922 QEMUIOVector local_qiov; 923 bool use_local_qiov = false; 924 int ret; 925 926 if (!drv) { 927 return -ENOMEDIUM; 928 } 929 930 ret = bdrv_check_byte_request(bs, offset, bytes); 931 if (ret < 0) { 932 return ret; 933 } 934 935 /* Don't do copy-on-read if we read data before write operation */ 936 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_COPY_ON_READ)) { 937 flags |= BDRV_REQ_COPY_ON_READ; 
938 } 939 940 /* throttling disk I/O */ 941 if (bs->io_limits_enabled) { 942 throttle_group_co_io_limits_intercept(bs, bytes, false); 943 } 944 945 /* Align read if necessary by padding qiov */ 946 if (offset & (align - 1)) { 947 head_buf = qemu_blockalign(bs, align); 948 qemu_iovec_init(&local_qiov, qiov->niov + 2); 949 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 950 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 951 use_local_qiov = true; 952 953 bytes += offset & (align - 1); 954 offset = offset & ~(align - 1); 955 } 956 957 if ((offset + bytes) & (align - 1)) { 958 if (!use_local_qiov) { 959 qemu_iovec_init(&local_qiov, qiov->niov + 1); 960 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 961 use_local_qiov = true; 962 } 963 tail_buf = qemu_blockalign(bs, align); 964 qemu_iovec_add(&local_qiov, tail_buf, 965 align - ((offset + bytes) & (align - 1))); 966 967 bytes = ROUND_UP(bytes, align); 968 } 969 970 tracked_request_begin(&req, bs, offset, bytes, false); 971 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 972 use_local_qiov ? &local_qiov : qiov, 973 flags); 974 tracked_request_end(&req); 975 976 if (use_local_qiov) { 977 qemu_iovec_destroy(&local_qiov); 978 qemu_vfree(head_buf); 979 qemu_vfree(tail_buf); 980 } 981 982 return ret; 983 } 984 985 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 986 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 987 BdrvRequestFlags flags) 988 { 989 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 990 return -EINVAL; 991 } 992 993 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 994 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 995 } 996 997 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 998 int nb_sectors, QEMUIOVector *qiov) 999 { 1000 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1001 1002 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1003 } 1004 1005 int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs, 1006 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1007 { 1008 trace_bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors); 1009 1010 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1011 BDRV_REQ_NO_COPY_ON_READ); 1012 } 1013 1014 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1015 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1016 { 1017 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1018 1019 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1020 BDRV_REQ_COPY_ON_READ); 1021 } 1022 1023 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1024 1025 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1026 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1027 { 1028 BlockDriver *drv = bs->drv; 1029 QEMUIOVector qiov; 1030 struct iovec iov = {0}; 1031 int ret = 0; 1032 1033 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1034 BDRV_REQUEST_MAX_SECTORS); 1035 1036 while (nb_sectors > 0 && !ret) { 1037 int num = nb_sectors; 1038 1039 /* Align request. Block drivers can expect the "bulk" of the request 1040 * to be aligned. 1041 */ 1042 if (bs->bl.write_zeroes_alignment 1043 && num > bs->bl.write_zeroes_alignment) { 1044 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1045 /* Make a small request up to the first aligned sector. 
*/ 1046 num = bs->bl.write_zeroes_alignment; 1047 num -= sector_num % bs->bl.write_zeroes_alignment; 1048 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1049 /* Shorten the request to the last aligned sector. num cannot 1050 * underflow because num > bs->bl.write_zeroes_alignment. 1051 */ 1052 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 1053 } 1054 } 1055 1056 /* limit request size */ 1057 if (num > max_write_zeroes) { 1058 num = max_write_zeroes; 1059 } 1060 1061 ret = -ENOTSUP; 1062 /* First try the efficient write zeroes operation */ 1063 if (drv->bdrv_co_write_zeroes) { 1064 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 1065 } 1066 1067 if (ret == -ENOTSUP) { 1068 /* Fall back to bounce buffer if write zeroes is unsupported */ 1069 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, 1070 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1071 num = MIN(num, max_xfer_len); 1072 iov.iov_len = num * BDRV_SECTOR_SIZE; 1073 if (iov.iov_base == NULL) { 1074 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); 1075 if (iov.iov_base == NULL) { 1076 ret = -ENOMEM; 1077 goto fail; 1078 } 1079 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 1080 } 1081 qemu_iovec_init_external(&qiov, &iov, 1); 1082 1083 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 1084 1085 /* Keep bounce buffer around if it is big enough for all 1086 * all future requests. 1087 */ 1088 if (num < max_xfer_len) { 1089 qemu_vfree(iov.iov_base); 1090 iov.iov_base = NULL; 1091 } 1092 } 1093 1094 sector_num += num; 1095 nb_sectors -= num; 1096 } 1097 1098 fail: 1099 qemu_vfree(iov.iov_base); 1100 return ret; 1101 } 1102 1103 /* 1104 * Forwards an already correctly aligned write request to the BlockDriver. 1105 */ 1106 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 1107 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1108 QEMUIOVector *qiov, int flags) 1109 { 1110 BlockDriver *drv = bs->drv; 1111 bool waited; 1112 int ret; 1113 1114 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 1115 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 1116 1117 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1118 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1119 assert(!qiov || bytes == qiov->size); 1120 1121 waited = wait_serialising_requests(req); 1122 assert(!waited || !req->serialising); 1123 assert(req->overlap_offset <= offset); 1124 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1125 1126 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1127 1128 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1129 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 1130 qemu_iovec_is_zero(qiov)) { 1131 flags |= BDRV_REQ_ZERO_WRITE; 1132 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1133 flags |= BDRV_REQ_MAY_UNMAP; 1134 } 1135 } 1136 1137 if (ret < 0) { 1138 /* Do nothing, write notifier decided to fail this request */ 1139 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1140 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); 1141 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 1142 } else { 1143 BLKDBG_EVENT(bs, BLKDBG_PWRITEV); 1144 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 1145 } 1146 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); 1147 1148 if (ret == 0 && !bs->enable_write_cache) { 1149 ret = bdrv_co_flush(bs); 1150 } 1151 1152 bdrv_set_dirty(bs, sector_num, nb_sectors); 1153 1154 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); 1155 
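    /* A successful write may have grown the image; keep the cached size in
     * bs->total_sectors in sync with the new end of the file (below). */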
1156 if (ret >= 0) { 1157 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1158 } 1159 1160 return ret; 1161 } 1162 1163 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, 1164 int64_t offset, 1165 unsigned int bytes, 1166 BdrvRequestFlags flags, 1167 BdrvTrackedRequest *req) 1168 { 1169 uint8_t *buf = NULL; 1170 QEMUIOVector local_qiov; 1171 struct iovec iov; 1172 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1173 unsigned int head_padding_bytes, tail_padding_bytes; 1174 int ret = 0; 1175 1176 head_padding_bytes = offset & (align - 1); 1177 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1178 1179 1180 assert(flags & BDRV_REQ_ZERO_WRITE); 1181 if (head_padding_bytes || tail_padding_bytes) { 1182 buf = qemu_blockalign(bs, align); 1183 iov = (struct iovec) { 1184 .iov_base = buf, 1185 .iov_len = align, 1186 }; 1187 qemu_iovec_init_external(&local_qiov, &iov, 1); 1188 } 1189 if (head_padding_bytes) { 1190 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1191 1192 /* RMW the unaligned part before head. */ 1193 mark_request_serialising(req, align); 1194 wait_serialising_requests(req); 1195 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 1196 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1197 align, &local_qiov, 0); 1198 if (ret < 0) { 1199 goto fail; 1200 } 1201 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1202 1203 memset(buf + head_padding_bytes, 0, zero_bytes); 1204 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1205 &local_qiov, 1206 flags & ~BDRV_REQ_ZERO_WRITE); 1207 if (ret < 0) { 1208 goto fail; 1209 } 1210 offset += zero_bytes; 1211 bytes -= zero_bytes; 1212 } 1213 1214 assert(!bytes || (offset & (align - 1)) == 0); 1215 if (bytes >= align) { 1216 /* Write the aligned part in the middle. */ 1217 uint64_t aligned_bytes = bytes & ~(align - 1); 1218 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1219 NULL, flags); 1220 if (ret < 0) { 1221 goto fail; 1222 } 1223 bytes -= aligned_bytes; 1224 offset += aligned_bytes; 1225 } 1226 1227 assert(!bytes || (offset & (align - 1)) == 0); 1228 if (bytes) { 1229 assert(align == tail_padding_bytes + bytes); 1230 /* RMW the unaligned part after tail. 
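         * The tail lies within one alignment chunk: read that chunk, zero
         * only its first 'bytes' bytes, and write it back, so the data
         * after the zeroed range is preserved.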
*/ 1231 mark_request_serialising(req, align); 1232 wait_serialising_requests(req); 1233 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 1234 ret = bdrv_aligned_preadv(bs, req, offset, align, 1235 align, &local_qiov, 0); 1236 if (ret < 0) { 1237 goto fail; 1238 } 1239 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1240 1241 memset(buf, 0, bytes); 1242 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1243 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1244 } 1245 fail: 1246 qemu_vfree(buf); 1247 return ret; 1248 1249 } 1250 1251 /* 1252 * Handle a write request in coroutine context 1253 */ 1254 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1255 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1256 BdrvRequestFlags flags) 1257 { 1258 BdrvTrackedRequest req; 1259 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1260 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1261 uint8_t *head_buf = NULL; 1262 uint8_t *tail_buf = NULL; 1263 QEMUIOVector local_qiov; 1264 bool use_local_qiov = false; 1265 int ret; 1266 1267 if (!bs->drv) { 1268 return -ENOMEDIUM; 1269 } 1270 if (bs->read_only) { 1271 return -EPERM; 1272 } 1273 1274 ret = bdrv_check_byte_request(bs, offset, bytes); 1275 if (ret < 0) { 1276 return ret; 1277 } 1278 1279 /* throttling disk I/O */ 1280 if (bs->io_limits_enabled) { 1281 throttle_group_co_io_limits_intercept(bs, bytes, true); 1282 } 1283 1284 /* 1285 * Align write if necessary by performing a read-modify-write cycle. 1286 * Pad qiov with the read parts and be sure to have a tracked request not 1287 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1288 */ 1289 tracked_request_begin(&req, bs, offset, bytes, true); 1290 1291 if (!qiov) { 1292 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1293 goto out; 1294 } 1295 1296 if (offset & (align - 1)) { 1297 QEMUIOVector head_qiov; 1298 struct iovec head_iov; 1299 1300 mark_request_serialising(&req, align); 1301 wait_serialising_requests(&req); 1302 1303 head_buf = qemu_blockalign(bs, align); 1304 head_iov = (struct iovec) { 1305 .iov_base = head_buf, 1306 .iov_len = align, 1307 }; 1308 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1309 1310 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 1311 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1312 align, &head_qiov, 0); 1313 if (ret < 0) { 1314 goto fail; 1315 } 1316 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1317 1318 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1319 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1320 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1321 use_local_qiov = true; 1322 1323 bytes += offset & (align - 1); 1324 offset = offset & ~(align - 1); 1325 } 1326 1327 if ((offset + bytes) & (align - 1)) { 1328 QEMUIOVector tail_qiov; 1329 struct iovec tail_iov; 1330 size_t tail_bytes; 1331 bool waited; 1332 1333 mark_request_serialising(&req, align); 1334 waited = wait_serialising_requests(&req); 1335 assert(!waited || !use_local_qiov); 1336 1337 tail_buf = qemu_blockalign(bs, align); 1338 tail_iov = (struct iovec) { 1339 .iov_base = tail_buf, 1340 .iov_len = align, 1341 }; 1342 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1343 1344 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 1345 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1346 align, &tail_qiov, 0); 1347 if (ret < 0) { 1348 goto fail; 1349 } 1350 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1351 1352 if (!use_local_qiov) { 1353 
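            /* The request head was already aligned, so the padded qiov
             * starts with the guest data itself. */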
qemu_iovec_init(&local_qiov, qiov->niov + 1); 1354 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1355 use_local_qiov = true; 1356 } 1357 1358 tail_bytes = (offset + bytes) & (align - 1); 1359 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1360 1361 bytes = ROUND_UP(bytes, align); 1362 } 1363 1364 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1365 use_local_qiov ? &local_qiov : qiov, 1366 flags); 1367 1368 fail: 1369 1370 if (use_local_qiov) { 1371 qemu_iovec_destroy(&local_qiov); 1372 } 1373 qemu_vfree(head_buf); 1374 qemu_vfree(tail_buf); 1375 out: 1376 tracked_request_end(&req); 1377 return ret; 1378 } 1379 1380 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1381 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1382 BdrvRequestFlags flags) 1383 { 1384 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1385 return -EINVAL; 1386 } 1387 1388 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1389 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1390 } 1391 1392 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1393 int nb_sectors, QEMUIOVector *qiov) 1394 { 1395 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1396 1397 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1398 } 1399 1400 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1401 int64_t sector_num, int nb_sectors, 1402 BdrvRequestFlags flags) 1403 { 1404 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1405 1406 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1407 flags &= ~BDRV_REQ_MAY_UNMAP; 1408 } 1409 1410 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1411 BDRV_REQ_ZERO_WRITE | flags); 1412 } 1413 1414 int bdrv_flush_all(void) 1415 { 1416 BlockDriverState *bs = NULL; 1417 int result = 0; 1418 1419 while ((bs = bdrv_next(bs))) { 1420 AioContext *aio_context = bdrv_get_aio_context(bs); 1421 int ret; 1422 1423 aio_context_acquire(aio_context); 1424 ret = bdrv_flush(bs); 1425 if (ret < 0 && !result) { 1426 result = ret; 1427 } 1428 aio_context_release(aio_context); 1429 } 1430 1431 return result; 1432 } 1433 1434 typedef struct BdrvCoGetBlockStatusData { 1435 BlockDriverState *bs; 1436 BlockDriverState *base; 1437 int64_t sector_num; 1438 int nb_sectors; 1439 int *pnum; 1440 int64_t ret; 1441 bool done; 1442 } BdrvCoGetBlockStatusData; 1443 1444 /* 1445 * Returns the allocation status of the specified sectors. 1446 * Drivers not implementing the functionality are assumed to not support 1447 * backing files, hence all their sectors are reported as allocated. 1448 * 1449 * If 'sector_num' is beyond the end of the disk image the return value is 0 1450 * and 'pnum' is set to 0. 1451 * 1452 * 'pnum' is set to the number of sectors (including and immediately following 1453 * the specified sector) that are known to be in the same 1454 * allocated/unallocated state. 1455 * 1456 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1457 * beyond the end of the disk image it will be clamped. 
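 *
 * The returned value is a bitmask: BDRV_BLOCK_DATA and BDRV_BLOCK_ZERO
 * describe the contents, BDRV_BLOCK_ALLOCATED means the range is allocated
 * in this layer (it does not come from a backing file), and when
 * BDRV_BLOCK_OFFSET_VALID is set the (sector-aligned) host offset is OR'ed
 * into the value.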
1458 */ 1459 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1460 int64_t sector_num, 1461 int nb_sectors, int *pnum) 1462 { 1463 int64_t total_sectors; 1464 int64_t n; 1465 int64_t ret, ret2; 1466 1467 total_sectors = bdrv_nb_sectors(bs); 1468 if (total_sectors < 0) { 1469 return total_sectors; 1470 } 1471 1472 if (sector_num >= total_sectors) { 1473 *pnum = 0; 1474 return 0; 1475 } 1476 1477 n = total_sectors - sector_num; 1478 if (n < nb_sectors) { 1479 nb_sectors = n; 1480 } 1481 1482 if (!bs->drv->bdrv_co_get_block_status) { 1483 *pnum = nb_sectors; 1484 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1485 if (bs->drv->protocol_name) { 1486 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1487 } 1488 return ret; 1489 } 1490 1491 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 1492 if (ret < 0) { 1493 *pnum = 0; 1494 return ret; 1495 } 1496 1497 if (ret & BDRV_BLOCK_RAW) { 1498 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1499 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 1500 *pnum, pnum); 1501 } 1502 1503 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1504 ret |= BDRV_BLOCK_ALLOCATED; 1505 } else { 1506 if (bdrv_unallocated_blocks_are_zero(bs)) { 1507 ret |= BDRV_BLOCK_ZERO; 1508 } else if (bs->backing_hd) { 1509 BlockDriverState *bs2 = bs->backing_hd; 1510 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1511 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1512 ret |= BDRV_BLOCK_ZERO; 1513 } 1514 } 1515 } 1516 1517 if (bs->file && 1518 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1519 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1520 int file_pnum; 1521 1522 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 1523 *pnum, &file_pnum); 1524 if (ret2 >= 0) { 1525 /* Ignore errors. This is just providing extra information, it 1526 * is useful but not necessary. 1527 */ 1528 if (!file_pnum) { 1529 /* !file_pnum indicates an offset at or beyond the EOF; it is 1530 * perfectly valid for the format block driver to point to such 1531 * offsets, so catch it and mark everything as zero */ 1532 ret |= BDRV_BLOCK_ZERO; 1533 } else { 1534 /* Limit request to the range reported by the protocol driver */ 1535 *pnum = file_pnum; 1536 ret |= (ret2 & BDRV_BLOCK_ZERO); 1537 } 1538 } 1539 } 1540 1541 return ret; 1542 } 1543 1544 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1545 BlockDriverState *base, 1546 int64_t sector_num, 1547 int nb_sectors, 1548 int *pnum) 1549 { 1550 BlockDriverState *p; 1551 int64_t ret = 0; 1552 1553 assert(bs != base); 1554 for (p = bs; p != base; p = p->backing_hd) { 1555 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); 1556 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1557 break; 1558 } 1559 /* [sector_num, pnum] unallocated on this layer, which could be only 1560 * the first part of [sector_num, nb_sectors]. */ 1561 nb_sectors = MIN(nb_sectors, *pnum); 1562 } 1563 return ret; 1564 } 1565 1566 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1567 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1568 { 1569 BdrvCoGetBlockStatusData *data = opaque; 1570 1571 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1572 data->sector_num, 1573 data->nb_sectors, 1574 data->pnum); 1575 data->done = true; 1576 } 1577 1578 /* 1579 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1580 * 1581 * See bdrv_co_get_block_status_above() for details. 
1582 */ 1583 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1584 BlockDriverState *base, 1585 int64_t sector_num, 1586 int nb_sectors, int *pnum) 1587 { 1588 Coroutine *co; 1589 BdrvCoGetBlockStatusData data = { 1590 .bs = bs, 1591 .base = base, 1592 .sector_num = sector_num, 1593 .nb_sectors = nb_sectors, 1594 .pnum = pnum, 1595 .done = false, 1596 }; 1597 1598 if (qemu_in_coroutine()) { 1599 /* Fast-path if already in coroutine context */ 1600 bdrv_get_block_status_above_co_entry(&data); 1601 } else { 1602 AioContext *aio_context = bdrv_get_aio_context(bs); 1603 1604 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1605 qemu_coroutine_enter(co, &data); 1606 while (!data.done) { 1607 aio_poll(aio_context, true); 1608 } 1609 } 1610 return data.ret; 1611 } 1612 1613 int64_t bdrv_get_block_status(BlockDriverState *bs, 1614 int64_t sector_num, 1615 int nb_sectors, int *pnum) 1616 { 1617 return bdrv_get_block_status_above(bs, bs->backing_hd, 1618 sector_num, nb_sectors, pnum); 1619 } 1620 1621 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1622 int nb_sectors, int *pnum) 1623 { 1624 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 1625 if (ret < 0) { 1626 return ret; 1627 } 1628 return !!(ret & BDRV_BLOCK_ALLOCATED); 1629 } 1630 1631 /* 1632 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1633 * 1634 * Return true if the given sector is allocated in any image between 1635 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1636 * sector is allocated in any image of the chain. Return false otherwise. 1637 * 1638 * 'pnum' is set to the number of sectors (including and immediately following 1639 * the specified sector) that are known to be in the same 1640 * allocated/unallocated state. 1641 * 1642 */ 1643 int bdrv_is_allocated_above(BlockDriverState *top, 1644 BlockDriverState *base, 1645 int64_t sector_num, 1646 int nb_sectors, int *pnum) 1647 { 1648 BlockDriverState *intermediate; 1649 int ret, n = nb_sectors; 1650 1651 intermediate = top; 1652 while (intermediate && intermediate != base) { 1653 int pnum_inter; 1654 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1655 &pnum_inter); 1656 if (ret < 0) { 1657 return ret; 1658 } else if (ret) { 1659 *pnum = pnum_inter; 1660 return 1; 1661 } 1662 1663 /* 1664 * [sector_num, nb_sectors] is unallocated on top but intermediate 1665 * might have 1666 * 1667 * [sector_num+x, nr_sectors] allocated. 
1668 */ 1669 if (n > pnum_inter && 1670 (intermediate == top || 1671 sector_num + pnum_inter < intermediate->total_sectors)) { 1672 n = pnum_inter; 1673 } 1674 1675 intermediate = intermediate->backing_hd; 1676 } 1677 1678 *pnum = n; 1679 return 0; 1680 } 1681 1682 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1683 const uint8_t *buf, int nb_sectors) 1684 { 1685 BlockDriver *drv = bs->drv; 1686 int ret; 1687 1688 if (!drv) { 1689 return -ENOMEDIUM; 1690 } 1691 if (!drv->bdrv_write_compressed) { 1692 return -ENOTSUP; 1693 } 1694 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1695 if (ret < 0) { 1696 return ret; 1697 } 1698 1699 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1700 1701 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1702 } 1703 1704 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1705 int64_t pos, int size) 1706 { 1707 QEMUIOVector qiov; 1708 struct iovec iov = { 1709 .iov_base = (void *) buf, 1710 .iov_len = size, 1711 }; 1712 1713 qemu_iovec_init_external(&qiov, &iov, 1); 1714 return bdrv_writev_vmstate(bs, &qiov, pos); 1715 } 1716 1717 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1718 { 1719 BlockDriver *drv = bs->drv; 1720 1721 if (!drv) { 1722 return -ENOMEDIUM; 1723 } else if (drv->bdrv_save_vmstate) { 1724 return drv->bdrv_save_vmstate(bs, qiov, pos); 1725 } else if (bs->file) { 1726 return bdrv_writev_vmstate(bs->file, qiov, pos); 1727 } 1728 1729 return -ENOTSUP; 1730 } 1731 1732 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1733 int64_t pos, int size) 1734 { 1735 BlockDriver *drv = bs->drv; 1736 if (!drv) 1737 return -ENOMEDIUM; 1738 if (drv->bdrv_load_vmstate) 1739 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1740 if (bs->file) 1741 return bdrv_load_vmstate(bs->file, buf, pos, size); 1742 return -ENOTSUP; 1743 } 1744 1745 /**************************************************************/ 1746 /* async I/Os */ 1747 1748 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1749 QEMUIOVector *qiov, int nb_sectors, 1750 BlockCompletionFunc *cb, void *opaque) 1751 { 1752 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1753 1754 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1755 cb, opaque, false); 1756 } 1757 1758 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1759 QEMUIOVector *qiov, int nb_sectors, 1760 BlockCompletionFunc *cb, void *opaque) 1761 { 1762 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1763 1764 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1765 cb, opaque, true); 1766 } 1767 1768 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1769 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1770 BlockCompletionFunc *cb, void *opaque) 1771 { 1772 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1773 1774 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1775 BDRV_REQ_ZERO_WRITE | flags, 1776 cb, opaque, true); 1777 } 1778 1779 1780 typedef struct MultiwriteCB { 1781 int error; 1782 int num_requests; 1783 int num_callbacks; 1784 struct { 1785 BlockCompletionFunc *cb; 1786 void *opaque; 1787 QEMUIOVector *free_qiov; 1788 } callbacks[]; 1789 } MultiwriteCB; 1790 1791 static void multiwrite_user_cb(MultiwriteCB *mcb) 1792 { 1793 int i; 1794 1795 for (i = 0; i < mcb->num_callbacks; i++) { 1796 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1797 if (mcb->callbacks[i].free_qiov) { 1798 
qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
1799 }
1800 g_free(mcb->callbacks[i].free_qiov);
1801 }
1802 }
1803
1804 static void multiwrite_cb(void *opaque, int ret)
1805 {
1806 MultiwriteCB *mcb = opaque;
1807
1808 trace_multiwrite_cb(mcb, ret);
1809
1810 if (ret < 0 && !mcb->error) {
1811 mcb->error = ret;
1812 }
1813
1814 mcb->num_requests--;
1815 if (mcb->num_requests == 0) {
1816 multiwrite_user_cb(mcb);
1817 g_free(mcb);
1818 }
1819 }
1820
1821 static int multiwrite_req_compare(const void *a, const void *b)
1822 {
1823 const BlockRequest *req1 = a, *req2 = b;
1824
1825 /*
1826 * Note that we can't simply subtract req2->sector from req1->sector
1827 * here as that could overflow the return value.
1828 */
1829 if (req1->sector > req2->sector) {
1830 return 1;
1831 } else if (req1->sector < req2->sector) {
1832 return -1;
1833 } else {
1834 return 0;
1835 }
1836 }
1837
1838 /*
1839 * Takes a bunch of requests and tries to merge them. Returns the number of
1840 * requests that remain after merging.
1841 */
1842 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
1843 int num_reqs, MultiwriteCB *mcb)
1844 {
1845 int i, outidx;
1846
1847 // Sort requests by start sector
1848 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
1849
1850 // Check if adjacent requests can be merged, i.e. they are exactly
1851 // sequential or overlapping. If so, combine them.
1852 outidx = 0;
1853 for (i = 1; i < num_reqs; i++) {
1854 int merge = 0;
1855 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
1856
1857 // Handle exactly sequential writes and overlapping writes.
1858 if (reqs[i].sector <= oldreq_last) {
1859 merge = 1;
1860 }
1861
1862 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
1863 merge = 0;
1864 }
1865
1866 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
1867 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
1868 merge = 0;
1869 }
1870
1871 if (merge) {
1872 size_t size;
1873 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
1874 qemu_iovec_init(qiov,
1875 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
1876
1877 // Add the first request to the merged one. If the requests are
1878 // overlapping, drop the last sectors of the first request.
1879 size = (reqs[i].sector - reqs[outidx].sector) << 9;
1880 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
1881
1882 // We should not need to add any zeros between the two requests
1883 assert (reqs[i].sector <= oldreq_last);
1884
1885 // Add the second request
1886 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
1887
1888 // Add tail of first request, if necessary
1889 if (qiov->size < reqs[outidx].qiov->size) {
1890 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
1891 reqs[outidx].qiov->size - qiov->size);
1892 }
1893
1894 reqs[outidx].nb_sectors = qiov->size >> 9;
1895 reqs[outidx].qiov = qiov;
1896
1897 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
1898 } else {
1899 outidx++;
1900 reqs[outidx].sector = reqs[i].sector;
1901 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
1902 reqs[outidx].qiov = reqs[i].qiov;
1903 }
1904 }
1905
1906 block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
1907
1908 return outidx + 1;
1909 }
1910
1911 /*
1912 * Submit multiple AIO write requests at once.
1913 *
1914 * On success, the function returns 0 and all requests in the reqs array have
1915 * been submitted. In the error case this function returns -1, and any of the
1916 * requests may or may not be submitted yet.
In particular, this means that the 1917 * callback will be called for some of the requests, for others it won't. The 1918 * caller must check the error field of the BlockRequest to wait for the right 1919 * callbacks (if error != 0, no callback will be called). 1920 * 1921 * The implementation may modify the contents of the reqs array, e.g. to merge 1922 * requests. However, the fields opaque and error are left unmodified as they 1923 * are used to signal failure for a single request to the caller. 1924 */ 1925 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 1926 { 1927 MultiwriteCB *mcb; 1928 int i; 1929 1930 /* don't submit writes if we don't have a medium */ 1931 if (bs->drv == NULL) { 1932 for (i = 0; i < num_reqs; i++) { 1933 reqs[i].error = -ENOMEDIUM; 1934 } 1935 return -1; 1936 } 1937 1938 if (num_reqs == 0) { 1939 return 0; 1940 } 1941 1942 // Create MultiwriteCB structure 1943 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 1944 mcb->num_requests = 0; 1945 mcb->num_callbacks = num_reqs; 1946 1947 for (i = 0; i < num_reqs; i++) { 1948 mcb->callbacks[i].cb = reqs[i].cb; 1949 mcb->callbacks[i].opaque = reqs[i].opaque; 1950 } 1951 1952 // Check for mergable requests 1953 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 1954 1955 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 1956 1957 /* Run the aio requests. */ 1958 mcb->num_requests = num_reqs; 1959 for (i = 0; i < num_reqs; i++) { 1960 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 1961 reqs[i].nb_sectors, reqs[i].flags, 1962 multiwrite_cb, mcb, 1963 true); 1964 } 1965 1966 return 0; 1967 } 1968 1969 void bdrv_aio_cancel(BlockAIOCB *acb) 1970 { 1971 qemu_aio_ref(acb); 1972 bdrv_aio_cancel_async(acb); 1973 while (acb->refcnt > 1) { 1974 if (acb->aiocb_info->get_aio_context) { 1975 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 1976 } else if (acb->bs) { 1977 aio_poll(bdrv_get_aio_context(acb->bs), true); 1978 } else { 1979 abort(); 1980 } 1981 } 1982 qemu_aio_unref(acb); 1983 } 1984 1985 /* Async version of aio cancel. The caller is not blocked if the acb implements 1986 * cancel_async, otherwise we do nothing and let the request normally complete. 1987 * In either case the completion callback must be called. 
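 *
 * bdrv_aio_cancel() above relies on this guarantee: it keeps polling the
 * AioContext until the callback has run and the ACB refcount drops back
 * to one.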

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async; otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
 */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}
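
/*
 * Completion ordering sketch (illustrative only, not compiled). The need_bh
 * flag above guarantees that the caller's completion callback never runs
 * before the emitter that created the acb has returned; bdrv_co_aio_rw_vector
 * and the bdrv_aio_flush/bdrv_aio_discard wrappers below all follow the same
 * pattern:
 *
 *     acb = qemu_aio_get(...);          // need_bh == true
 *     qemu_coroutine_enter(co, acb);    // may complete synchronously and set
 *                                       // req.error, but bdrv_co_complete()
 *                                       // does nothing while need_bh is set
 *     bdrv_co_maybe_schedule_bh(acb);   // need_bh = false; if the request
 *                                       // already completed, the callback is
 *                                       // deferred to a bottom half instead
 *                                       // of being invoked here
 */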

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_malloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_free(acb);
    }
}
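
/*
 * AIOCB reference counting sketch (illustrative only; my_cb and my_opaque are
 * hypothetical). Every acb starts with refcnt == 1 from qemu_aio_get(), and
 * the emulation code above drops that reference after the completion callback
 * has run. A caller that wants to keep the handle alive, as bdrv_aio_cancel()
 * does while it polls, must take its own reference:
 *
 *     acb = bdrv_aio_flush(bs, my_cb, my_opaque);  // refcnt == 1
 *     qemu_aio_ref(acb);                           // refcnt == 2
 *     ...
 *     // completion: my_cb() runs, then qemu_aio_unref() -> refcnt == 1
 *     qemu_aio_unref(acb);                         // refcnt == 0, acb freed
 */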

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
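
/*
 * Driver-side sketch (illustrative only, not compiled): a hypothetical
 * protocol driver backed by a POSIX file descriptor could satisfy the
 * "flush to disk" stage of bdrv_co_flush() like this (MyProtoState and s->fd
 * are made-up names):
 *
 *     static int coroutine_fn myproto_co_flush_to_disk(BlockDriverState *bs)
 *     {
 *         MyProtoState *s = bs->opaque;
 *
 *         if (qemu_fdatasync(s->fd) < 0) {
 *             return -errno;
 *         }
 *         return 0;
 *     }
 *
 * Drivers that only provide bdrv_aio_flush are driven through
 * bdrv_co_io_em_complete() instead, and drivers that provide neither fall
 * through with ret = 0 (see the comment in the fallback branch above).
 */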

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass the clamped chunk size (num), not the whole remainder */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
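
/*
 * Worked example (illustrative only) of how bdrv_co_discard() splits a
 * request. Assume bs->bl.discard_alignment == 2048 sectors and
 * bs->bl.max_discard == 0, so max_discard is BDRV_REQUEST_MAX_SECTORS, and a
 * caller issues bdrv_discard(bs, 1000, 5000):
 *
 *     iteration 1: num = 5000 -> clamped to the alignment (2048), minus the
 *                  misalignment (1000 % 2048 = 1000) -> num = 1048;
 *                  sectors [1000, 2048) are discarded
 *     iteration 2: sector_num = 2048, nb_sectors = 3952, now aligned;
 *                  sectors [2048, 6000) are discarded in one chunk
 *
 * Each chunk goes to the driver's bdrv_co_discard or bdrv_aio_discard
 * callback, and -ENOTSUP from the driver is ignored so that the loop simply
 * moves on to the next chunk.
 */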

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
    bdrv_start_throttled_reqs(bs);
}
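
/*
 * Usage sketch (illustrative only, not compiled): building a QEMUIOVector
 * that passes bdrv_qiov_is_aligned(), as an external caller might do. 'len'
 * is a hypothetical request size assumed to be a multiple of
 * bdrv_min_mem_align(bs).
 *
 *     QEMUIOVector qiov;
 *     void *buf = qemu_try_blockalign(bs, len);
 *
 *     if (buf == NULL) {
 *         return -ENOMEM;
 *     }
 *     qemu_iovec_init(&qiov, 1);
 *     qemu_iovec_add(&qiov, buf, len);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *     ...
 *     qemu_iovec_destroy(&qiov);
 *     qemu_vfree(buf);
 */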