/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_group_config(bs, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;
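    /* Clearing the flag before draining keeps it cleared once
     * bdrv_start_throttled_reqs() restores its saved copy of it, so neither
     * the restarted requests nor new ones are throttled again. */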
    bdrv_start_throttled_reqs(bs);
    throttle_group_unregister_bs(bs);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is already part of the group we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing->bs->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing->bs->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing->bs->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing->bs->bl.min_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
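 *
 * A typical user (sketch, not tied to any specific caller) brackets its work
 * with the pair of calls defined below:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... guest reads issued here also populate the top image ...
 *     bdrv_disable_copy_on_read(bs);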
204 */ 205 void bdrv_enable_copy_on_read(BlockDriverState *bs) 206 { 207 bs->copy_on_read++; 208 } 209 210 void bdrv_disable_copy_on_read(BlockDriverState *bs) 211 { 212 assert(bs->copy_on_read > 0); 213 bs->copy_on_read--; 214 } 215 216 /* Check if any requests are in-flight (including throttled requests) */ 217 bool bdrv_requests_pending(BlockDriverState *bs) 218 { 219 if (!QLIST_EMPTY(&bs->tracked_requests)) { 220 return true; 221 } 222 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 223 return true; 224 } 225 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 226 return true; 227 } 228 if (bs->file && bdrv_requests_pending(bs->file->bs)) { 229 return true; 230 } 231 if (bs->backing && bdrv_requests_pending(bs->backing->bs)) { 232 return true; 233 } 234 return false; 235 } 236 237 /* 238 * Wait for pending requests to complete on a single BlockDriverState subtree 239 * 240 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 241 * AioContext. 242 * 243 * Only this BlockDriverState's AioContext is run, so in-flight requests must 244 * not depend on events in other AioContexts. In that case, use 245 * bdrv_drain_all() instead. 246 */ 247 void bdrv_drain(BlockDriverState *bs) 248 { 249 bool busy = true; 250 251 while (busy) { 252 /* Keep iterating */ 253 bdrv_flush_io_queue(bs); 254 busy = bdrv_requests_pending(bs); 255 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 256 } 257 } 258 259 /* 260 * Wait for pending requests to complete across all BlockDriverStates 261 * 262 * This function does not flush data to disk, use bdrv_flush_all() for that 263 * after calling this function. 264 */ 265 void bdrv_drain_all(void) 266 { 267 /* Always run first iteration so any pending completion BHs run */ 268 bool busy = true; 269 BlockDriverState *bs = NULL; 270 GSList *aio_ctxs = NULL, *ctx; 271 272 while ((bs = bdrv_next(bs))) { 273 AioContext *aio_context = bdrv_get_aio_context(bs); 274 275 aio_context_acquire(aio_context); 276 if (bs->job) { 277 block_job_pause(bs->job); 278 } 279 aio_context_release(aio_context); 280 281 if (!g_slist_find(aio_ctxs, aio_context)) { 282 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 283 } 284 } 285 286 /* Note that completion of an asynchronous I/O operation can trigger any 287 * number of other I/O operations on other devices---for example a 288 * coroutine can submit an I/O request to another device in response to 289 * request completion. Therefore we must keep looping until there was no 290 * more activity rather than simply draining each device independently. 291 */ 292 while (busy) { 293 busy = false; 294 295 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 296 AioContext *aio_context = ctx->data; 297 bs = NULL; 298 299 aio_context_acquire(aio_context); 300 while ((bs = bdrv_next(bs))) { 301 if (aio_context == bdrv_get_aio_context(bs)) { 302 bdrv_flush_io_queue(bs); 303 if (bdrv_requests_pending(bs)) { 304 busy = true; 305 aio_poll(aio_context, busy); 306 } 307 } 308 } 309 busy |= aio_poll(aio_context, false); 310 aio_context_release(aio_context); 311 } 312 } 313 314 bs = NULL; 315 while ((bs = bdrv_next(bs))) { 316 AioContext *aio_context = bdrv_get_aio_context(bs); 317 318 aio_context_acquire(aio_context); 319 if (bs->job) { 320 block_job_resume(bs->job); 321 } 322 aio_context_release(aio_context); 323 } 324 g_slist_free(aio_ctxs); 325 } 326 327 /** 328 * Remove an active request from the tracked requests list 329 * 330 * This function should be called when a tracked request is completing. 
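 *
 * Tracked requests are used in begin/end pairs around the aligned I/O path,
 * as bdrv_co_do_preadv() does below (sketch):
 *
 *     BdrvTrackedRequest req;
 *
 *     tracked_request_begin(&req, bs, offset, bytes, false);
 *     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, qiov, flags);
 *     tracked_request_end(&req);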
331 */ 332 static void tracked_request_end(BdrvTrackedRequest *req) 333 { 334 if (req->serialising) { 335 req->bs->serialising_in_flight--; 336 } 337 338 QLIST_REMOVE(req, list); 339 qemu_co_queue_restart_all(&req->wait_queue); 340 } 341 342 /** 343 * Add an active request to the tracked requests list 344 */ 345 static void tracked_request_begin(BdrvTrackedRequest *req, 346 BlockDriverState *bs, 347 int64_t offset, 348 unsigned int bytes, bool is_write) 349 { 350 *req = (BdrvTrackedRequest){ 351 .bs = bs, 352 .offset = offset, 353 .bytes = bytes, 354 .is_write = is_write, 355 .co = qemu_coroutine_self(), 356 .serialising = false, 357 .overlap_offset = offset, 358 .overlap_bytes = bytes, 359 }; 360 361 qemu_co_queue_init(&req->wait_queue); 362 363 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 364 } 365 366 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 367 { 368 int64_t overlap_offset = req->offset & ~(align - 1); 369 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 370 - overlap_offset; 371 372 if (!req->serialising) { 373 req->bs->serialising_in_flight++; 374 req->serialising = true; 375 } 376 377 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 378 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 379 } 380 381 /** 382 * Round a region to cluster boundaries 383 */ 384 void bdrv_round_to_clusters(BlockDriverState *bs, 385 int64_t sector_num, int nb_sectors, 386 int64_t *cluster_sector_num, 387 int *cluster_nb_sectors) 388 { 389 BlockDriverInfo bdi; 390 391 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 392 *cluster_sector_num = sector_num; 393 *cluster_nb_sectors = nb_sectors; 394 } else { 395 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 396 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 397 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 398 nb_sectors, c); 399 } 400 } 401 402 static int bdrv_get_cluster_size(BlockDriverState *bs) 403 { 404 BlockDriverInfo bdi; 405 int ret; 406 407 ret = bdrv_get_info(bs, &bdi); 408 if (ret < 0 || bdi.cluster_size == 0) { 409 return bs->request_alignment; 410 } else { 411 return bdi.cluster_size; 412 } 413 } 414 415 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 416 int64_t offset, unsigned int bytes) 417 { 418 /* aaaa bbbb */ 419 if (offset >= req->overlap_offset + req->overlap_bytes) { 420 return false; 421 } 422 /* bbbb aaaa */ 423 if (req->overlap_offset >= offset + bytes) { 424 return false; 425 } 426 return true; 427 } 428 429 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 430 { 431 BlockDriverState *bs = self->bs; 432 BdrvTrackedRequest *req; 433 bool retry; 434 bool waited = false; 435 436 if (!bs->serialising_in_flight) { 437 return false; 438 } 439 440 do { 441 retry = false; 442 QLIST_FOREACH(req, &bs->tracked_requests, list) { 443 if (req == self || (!req->serialising && !self->serialising)) { 444 continue; 445 } 446 if (tracked_request_overlaps(req, self->overlap_offset, 447 self->overlap_bytes)) 448 { 449 /* Hitting this means there was a reentrant request, for 450 * example, a block driver issuing nested requests. This must 451 * never happen since it means deadlock. 452 */ 453 assert(qemu_coroutine_self() != req->co); 454 455 /* If the request is already (indirectly) waiting for us, or 456 * will wait for us as soon as it wakes up, then just go on 457 * (instead of producing a deadlock in the former case). 
*/ 458 if (!req->waiting_for) { 459 self->waiting_for = req; 460 qemu_co_queue_wait(&req->wait_queue); 461 self->waiting_for = NULL; 462 retry = true; 463 waited = true; 464 break; 465 } 466 } 467 } 468 } while (retry); 469 470 return waited; 471 } 472 473 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 474 size_t size) 475 { 476 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 477 return -EIO; 478 } 479 480 if (!bdrv_is_inserted(bs)) { 481 return -ENOMEDIUM; 482 } 483 484 if (offset < 0) { 485 return -EIO; 486 } 487 488 return 0; 489 } 490 491 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 492 int nb_sectors) 493 { 494 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 495 return -EIO; 496 } 497 498 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 499 nb_sectors * BDRV_SECTOR_SIZE); 500 } 501 502 typedef struct RwCo { 503 BlockDriverState *bs; 504 int64_t offset; 505 QEMUIOVector *qiov; 506 bool is_write; 507 int ret; 508 BdrvRequestFlags flags; 509 } RwCo; 510 511 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 512 { 513 RwCo *rwco = opaque; 514 515 if (!rwco->is_write) { 516 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 517 rwco->qiov->size, rwco->qiov, 518 rwco->flags); 519 } else { 520 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 521 rwco->qiov->size, rwco->qiov, 522 rwco->flags); 523 } 524 } 525 526 /* 527 * Process a vectored synchronous request using coroutines 528 */ 529 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 530 QEMUIOVector *qiov, bool is_write, 531 BdrvRequestFlags flags) 532 { 533 Coroutine *co; 534 RwCo rwco = { 535 .bs = bs, 536 .offset = offset, 537 .qiov = qiov, 538 .is_write = is_write, 539 .ret = NOT_DONE, 540 .flags = flags, 541 }; 542 543 /** 544 * In sync call context, when the vcpu is blocked, this throttling timer 545 * will not fire; so the I/O throttling function has to be disabled here 546 * if it has been enabled. 547 */ 548 if (bs->io_limits_enabled) { 549 fprintf(stderr, "Disabling I/O throttling on '%s' due " 550 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 551 bdrv_io_limits_disable(bs); 552 } 553 554 if (qemu_in_coroutine()) { 555 /* Fast-path if already in coroutine context */ 556 bdrv_rw_co_entry(&rwco); 557 } else { 558 AioContext *aio_context = bdrv_get_aio_context(bs); 559 560 co = qemu_coroutine_create(bdrv_rw_co_entry); 561 qemu_coroutine_enter(co, &rwco); 562 while (rwco.ret == NOT_DONE) { 563 aio_poll(aio_context, true); 564 } 565 } 566 return rwco.ret; 567 } 568 569 /* 570 * Process a synchronous request using coroutines 571 */ 572 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 573 int nb_sectors, bool is_write, BdrvRequestFlags flags) 574 { 575 QEMUIOVector qiov; 576 struct iovec iov = { 577 .iov_base = (void *)buf, 578 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 579 }; 580 581 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 582 return -EINVAL; 583 } 584 585 qemu_iovec_init_external(&qiov, &iov, 1); 586 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 587 &qiov, is_write, flags); 588 } 589 590 /* return < 0 if error. 
See bdrv_write() for the return codes */ 591 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 592 uint8_t *buf, int nb_sectors) 593 { 594 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 595 } 596 597 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 598 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 599 uint8_t *buf, int nb_sectors) 600 { 601 bool enabled; 602 int ret; 603 604 enabled = bs->io_limits_enabled; 605 bs->io_limits_enabled = false; 606 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 607 bs->io_limits_enabled = enabled; 608 return ret; 609 } 610 611 /* Return < 0 if error. Important errors are: 612 -EIO generic I/O error (may happen for all errors) 613 -ENOMEDIUM No media inserted. 614 -EINVAL Invalid sector number or nb_sectors 615 -EACCES Trying to write a read-only device 616 */ 617 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 618 const uint8_t *buf, int nb_sectors) 619 { 620 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 621 } 622 623 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 624 int nb_sectors, BdrvRequestFlags flags) 625 { 626 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 627 BDRV_REQ_ZERO_WRITE | flags); 628 } 629 630 /* 631 * Completely zero out a block device with the help of bdrv_write_zeroes. 632 * The operation is sped up by checking the block status and only writing 633 * zeroes to the device if they currently do not return zeroes. Optional 634 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 635 * 636 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 637 */ 638 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 639 { 640 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 641 int n; 642 643 target_sectors = bdrv_nb_sectors(bs); 644 if (target_sectors < 0) { 645 return target_sectors; 646 } 647 648 for (;;) { 649 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 650 if (nb_sectors <= 0) { 651 return 0; 652 } 653 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 654 if (ret < 0) { 655 error_report("error getting block status at sector %" PRId64 ": %s", 656 sector_num, strerror(-ret)); 657 return ret; 658 } 659 if (ret & BDRV_BLOCK_ZERO) { 660 sector_num += n; 661 continue; 662 } 663 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 664 if (ret < 0) { 665 error_report("error writing zeroes at sector %" PRId64 ": %s", 666 sector_num, strerror(-ret)); 667 return ret; 668 } 669 sector_num += n; 670 } 671 } 672 673 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 674 { 675 QEMUIOVector qiov; 676 struct iovec iov = { 677 .iov_base = (void *)buf, 678 .iov_len = bytes, 679 }; 680 int ret; 681 682 if (bytes < 0) { 683 return -EINVAL; 684 } 685 686 qemu_iovec_init_external(&qiov, &iov, 1); 687 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 688 if (ret < 0) { 689 return ret; 690 } 691 692 return bytes; 693 } 694 695 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 696 { 697 int ret; 698 699 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 700 if (ret < 0) { 701 return ret; 702 } 703 704 return qiov->size; 705 } 706 707 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 708 const void *buf, int bytes) 709 { 710 QEMUIOVector qiov; 711 struct iovec iov = { 712 .iov_base = (void *) buf, 713 .iov_len = bytes, 714 }; 715 716 if (bytes < 0) { 717 return -EINVAL; 718 } 719 720 
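    /* Wrap the caller's linear buffer in a one-element QEMUIOVector so that
     * the vectored bdrv_pwritev() path can be reused. */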
qemu_iovec_init_external(&qiov, &iov, 1); 721 return bdrv_pwritev(bs, offset, &qiov); 722 } 723 724 /* 725 * Writes to the file and ensures that no writes are reordered across this 726 * request (acts as a barrier) 727 * 728 * Returns 0 on success, -errno in error cases. 729 */ 730 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 731 const void *buf, int count) 732 { 733 int ret; 734 735 ret = bdrv_pwrite(bs, offset, buf, count); 736 if (ret < 0) { 737 return ret; 738 } 739 740 /* No flush needed for cache modes that already do it */ 741 if (bs->enable_write_cache) { 742 bdrv_flush(bs); 743 } 744 745 return 0; 746 } 747 748 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 749 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 750 { 751 /* Perform I/O through a temporary buffer so that users who scribble over 752 * their read buffer while the operation is in progress do not end up 753 * modifying the image file. This is critical for zero-copy guest I/O 754 * where anything might happen inside guest memory. 755 */ 756 void *bounce_buffer; 757 758 BlockDriver *drv = bs->drv; 759 struct iovec iov; 760 QEMUIOVector bounce_qiov; 761 int64_t cluster_sector_num; 762 int cluster_nb_sectors; 763 size_t skip_bytes; 764 int ret; 765 766 /* Cover entire cluster so no additional backing file I/O is required when 767 * allocating cluster in the image file. 768 */ 769 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 770 &cluster_sector_num, &cluster_nb_sectors); 771 772 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 773 cluster_sector_num, cluster_nb_sectors); 774 775 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 776 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 777 if (bounce_buffer == NULL) { 778 ret = -ENOMEM; 779 goto err; 780 } 781 782 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 783 784 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 785 &bounce_qiov); 786 if (ret < 0) { 787 goto err; 788 } 789 790 if (drv->bdrv_co_write_zeroes && 791 buffer_is_zero(bounce_buffer, iov.iov_len)) { 792 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 793 cluster_nb_sectors, 0); 794 } else { 795 /* This does not change the data on the disk, it is not necessary 796 * to flush even in cache=writethrough mode. 797 */ 798 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 799 &bounce_qiov); 800 } 801 802 if (ret < 0) { 803 /* It might be okay to ignore write errors for guest requests. If this 804 * is a deliberate copy-on-read then we don't want to ignore the error. 805 * Simply report it in all cases. 806 */ 807 goto err; 808 } 809 810 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 811 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 812 nb_sectors * BDRV_SECTOR_SIZE); 813 814 err: 815 qemu_vfree(bounce_buffer); 816 return ret; 817 } 818 819 /* 820 * Forwards an already correctly aligned request to the BlockDriver. This 821 * handles copy on read and zeroing after EOF; any other features must be 822 * implemented by the caller. 
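 *
 * The request must already be aligned to BDRV_SECTOR_SIZE and, when a qiov is
 * supplied, its size must equal 'bytes'; both conditions are asserted below.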
823 */ 824 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 825 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 826 int64_t align, QEMUIOVector *qiov, int flags) 827 { 828 BlockDriver *drv = bs->drv; 829 int ret; 830 831 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 832 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 833 834 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 835 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 836 assert(!qiov || bytes == qiov->size); 837 838 /* Handle Copy on Read and associated serialisation */ 839 if (flags & BDRV_REQ_COPY_ON_READ) { 840 /* If we touch the same cluster it counts as an overlap. This 841 * guarantees that allocating writes will be serialized and not race 842 * with each other for the same cluster. For example, in copy-on-read 843 * it ensures that the CoR read and write operations are atomic and 844 * guest writes cannot interleave between them. */ 845 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 846 } 847 848 wait_serialising_requests(req); 849 850 if (flags & BDRV_REQ_COPY_ON_READ) { 851 int pnum; 852 853 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 854 if (ret < 0) { 855 goto out; 856 } 857 858 if (!ret || pnum != nb_sectors) { 859 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 860 goto out; 861 } 862 } 863 864 /* Forward the request to the BlockDriver */ 865 if (!bs->zero_beyond_eof) { 866 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 867 } else { 868 /* Read zeros after EOF */ 869 int64_t total_sectors, max_nb_sectors; 870 871 total_sectors = bdrv_nb_sectors(bs); 872 if (total_sectors < 0) { 873 ret = total_sectors; 874 goto out; 875 } 876 877 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 878 align >> BDRV_SECTOR_BITS); 879 if (nb_sectors < max_nb_sectors) { 880 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 881 } else if (max_nb_sectors > 0) { 882 QEMUIOVector local_qiov; 883 884 qemu_iovec_init(&local_qiov, qiov->niov); 885 qemu_iovec_concat(&local_qiov, qiov, 0, 886 max_nb_sectors * BDRV_SECTOR_SIZE); 887 888 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 889 &local_qiov); 890 891 qemu_iovec_destroy(&local_qiov); 892 } else { 893 ret = 0; 894 } 895 896 /* Reading beyond end of file is supposed to produce zeroes */ 897 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 898 uint64_t offset = MAX(0, total_sectors - sector_num); 899 uint64_t bytes = (sector_num + nb_sectors - offset) * 900 BDRV_SECTOR_SIZE; 901 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 902 } 903 } 904 905 out: 906 return ret; 907 } 908 909 /* 910 * Handle a read request in coroutine context 911 */ 912 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 913 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 914 BdrvRequestFlags flags) 915 { 916 BlockDriver *drv = bs->drv; 917 BdrvTrackedRequest req; 918 919 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 920 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 921 uint8_t *head_buf = NULL; 922 uint8_t *tail_buf = NULL; 923 QEMUIOVector local_qiov; 924 bool use_local_qiov = false; 925 int ret; 926 927 if (!drv) { 928 return -ENOMEDIUM; 929 } 930 931 ret = bdrv_check_byte_request(bs, offset, bytes); 932 if (ret < 0) { 933 return ret; 934 } 935 936 /* Don't do copy-on-read if we read data before write operation */ 937 if (bs->copy_on_read && !(flags & BDRV_REQ_NO_COPY_ON_READ)) { 938 flags |= BDRV_REQ_COPY_ON_READ; 
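        /* The flag is consumed by bdrv_aligned_preadv(), which serialises the
         * request and routes reads of unallocated parts through
         * bdrv_co_do_copy_on_readv(). */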
939 } 940 941 /* throttling disk I/O */ 942 if (bs->io_limits_enabled) { 943 throttle_group_co_io_limits_intercept(bs, bytes, false); 944 } 945 946 /* Align read if necessary by padding qiov */ 947 if (offset & (align - 1)) { 948 head_buf = qemu_blockalign(bs, align); 949 qemu_iovec_init(&local_qiov, qiov->niov + 2); 950 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 951 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 952 use_local_qiov = true; 953 954 bytes += offset & (align - 1); 955 offset = offset & ~(align - 1); 956 } 957 958 if ((offset + bytes) & (align - 1)) { 959 if (!use_local_qiov) { 960 qemu_iovec_init(&local_qiov, qiov->niov + 1); 961 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 962 use_local_qiov = true; 963 } 964 tail_buf = qemu_blockalign(bs, align); 965 qemu_iovec_add(&local_qiov, tail_buf, 966 align - ((offset + bytes) & (align - 1))); 967 968 bytes = ROUND_UP(bytes, align); 969 } 970 971 tracked_request_begin(&req, bs, offset, bytes, false); 972 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 973 use_local_qiov ? &local_qiov : qiov, 974 flags); 975 tracked_request_end(&req); 976 977 if (use_local_qiov) { 978 qemu_iovec_destroy(&local_qiov); 979 qemu_vfree(head_buf); 980 qemu_vfree(tail_buf); 981 } 982 983 return ret; 984 } 985 986 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 987 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 988 BdrvRequestFlags flags) 989 { 990 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 991 return -EINVAL; 992 } 993 994 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 995 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 996 } 997 998 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 999 int nb_sectors, QEMUIOVector *qiov) 1000 { 1001 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1002 1003 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1004 } 1005 1006 int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs, 1007 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1008 { 1009 trace_bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors); 1010 1011 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1012 BDRV_REQ_NO_COPY_ON_READ); 1013 } 1014 1015 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1016 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1017 { 1018 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1019 1020 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1021 BDRV_REQ_COPY_ON_READ); 1022 } 1023 1024 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1025 1026 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1027 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1028 { 1029 BlockDriver *drv = bs->drv; 1030 QEMUIOVector qiov; 1031 struct iovec iov = {0}; 1032 int ret = 0; 1033 1034 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1035 BDRV_REQUEST_MAX_SECTORS); 1036 1037 while (nb_sectors > 0 && !ret) { 1038 int num = nb_sectors; 1039 1040 /* Align request. Block drivers can expect the "bulk" of the request 1041 * to be aligned. 1042 */ 1043 if (bs->bl.write_zeroes_alignment 1044 && num > bs->bl.write_zeroes_alignment) { 1045 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1046 /* Make a small request up to the first aligned sector. 
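                 * For example, with write_zeroes_alignment 8 and sector_num 5,
                 * num becomes 8 - (5 % 8) = 3, so the next iteration starts on
                 * an aligned boundary.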
                 */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_offset < offset + bytes) {
bs->wr_highest_offset = offset + bytes; 1157 } 1158 1159 if (ret >= 0) { 1160 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1161 } 1162 1163 return ret; 1164 } 1165 1166 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, 1167 int64_t offset, 1168 unsigned int bytes, 1169 BdrvRequestFlags flags, 1170 BdrvTrackedRequest *req) 1171 { 1172 uint8_t *buf = NULL; 1173 QEMUIOVector local_qiov; 1174 struct iovec iov; 1175 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1176 unsigned int head_padding_bytes, tail_padding_bytes; 1177 int ret = 0; 1178 1179 head_padding_bytes = offset & (align - 1); 1180 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1181 1182 1183 assert(flags & BDRV_REQ_ZERO_WRITE); 1184 if (head_padding_bytes || tail_padding_bytes) { 1185 buf = qemu_blockalign(bs, align); 1186 iov = (struct iovec) { 1187 .iov_base = buf, 1188 .iov_len = align, 1189 }; 1190 qemu_iovec_init_external(&local_qiov, &iov, 1); 1191 } 1192 if (head_padding_bytes) { 1193 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1194 1195 /* RMW the unaligned part before head. */ 1196 mark_request_serialising(req, align); 1197 wait_serialising_requests(req); 1198 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1199 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1200 align, &local_qiov, 0); 1201 if (ret < 0) { 1202 goto fail; 1203 } 1204 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1205 1206 memset(buf + head_padding_bytes, 0, zero_bytes); 1207 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1208 &local_qiov, 1209 flags & ~BDRV_REQ_ZERO_WRITE); 1210 if (ret < 0) { 1211 goto fail; 1212 } 1213 offset += zero_bytes; 1214 bytes -= zero_bytes; 1215 } 1216 1217 assert(!bytes || (offset & (align - 1)) == 0); 1218 if (bytes >= align) { 1219 /* Write the aligned part in the middle. */ 1220 uint64_t aligned_bytes = bytes & ~(align - 1); 1221 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1222 NULL, flags); 1223 if (ret < 0) { 1224 goto fail; 1225 } 1226 bytes -= aligned_bytes; 1227 offset += aligned_bytes; 1228 } 1229 1230 assert(!bytes || (offset & (align - 1)) == 0); 1231 if (bytes) { 1232 assert(align == tail_padding_bytes + bytes); 1233 /* RMW the unaligned part after tail. 
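         * The aligned block is read in full first; the memset() below then
         * zeroes the leading 'bytes' bytes while the tail padding keeps the
         * data that was just read.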
*/ 1234 mark_request_serialising(req, align); 1235 wait_serialising_requests(req); 1236 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1237 ret = bdrv_aligned_preadv(bs, req, offset, align, 1238 align, &local_qiov, 0); 1239 if (ret < 0) { 1240 goto fail; 1241 } 1242 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1243 1244 memset(buf, 0, bytes); 1245 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1246 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1247 } 1248 fail: 1249 qemu_vfree(buf); 1250 return ret; 1251 1252 } 1253 1254 /* 1255 * Handle a write request in coroutine context 1256 */ 1257 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1258 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1259 BdrvRequestFlags flags) 1260 { 1261 BdrvTrackedRequest req; 1262 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1263 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1264 uint8_t *head_buf = NULL; 1265 uint8_t *tail_buf = NULL; 1266 QEMUIOVector local_qiov; 1267 bool use_local_qiov = false; 1268 int ret; 1269 1270 if (!bs->drv) { 1271 return -ENOMEDIUM; 1272 } 1273 if (bs->read_only) { 1274 return -EPERM; 1275 } 1276 1277 ret = bdrv_check_byte_request(bs, offset, bytes); 1278 if (ret < 0) { 1279 return ret; 1280 } 1281 1282 /* throttling disk I/O */ 1283 if (bs->io_limits_enabled) { 1284 throttle_group_co_io_limits_intercept(bs, bytes, true); 1285 } 1286 1287 /* 1288 * Align write if necessary by performing a read-modify-write cycle. 1289 * Pad qiov with the read parts and be sure to have a tracked request not 1290 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1291 */ 1292 tracked_request_begin(&req, bs, offset, bytes, true); 1293 1294 if (!qiov) { 1295 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1296 goto out; 1297 } 1298 1299 if (offset & (align - 1)) { 1300 QEMUIOVector head_qiov; 1301 struct iovec head_iov; 1302 1303 mark_request_serialising(&req, align); 1304 wait_serialising_requests(&req); 1305 1306 head_buf = qemu_blockalign(bs, align); 1307 head_iov = (struct iovec) { 1308 .iov_base = head_buf, 1309 .iov_len = align, 1310 }; 1311 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1312 1313 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 1314 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1315 align, &head_qiov, 0); 1316 if (ret < 0) { 1317 goto fail; 1318 } 1319 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1320 1321 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1322 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1323 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1324 use_local_qiov = true; 1325 1326 bytes += offset & (align - 1); 1327 offset = offset & ~(align - 1); 1328 } 1329 1330 if ((offset + bytes) & (align - 1)) { 1331 QEMUIOVector tail_qiov; 1332 struct iovec tail_iov; 1333 size_t tail_bytes; 1334 bool waited; 1335 1336 mark_request_serialising(&req, align); 1337 waited = wait_serialising_requests(&req); 1338 assert(!waited || !use_local_qiov); 1339 1340 tail_buf = qemu_blockalign(bs, align); 1341 tail_iov = (struct iovec) { 1342 .iov_base = tail_buf, 1343 .iov_len = align, 1344 }; 1345 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1346 1347 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 1348 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1349 align, &tail_qiov, 0); 1350 if (ret < 0) { 1351 goto fail; 1352 } 1353 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1354 1355 if 
(!use_local_qiov) { 1356 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1357 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1358 use_local_qiov = true; 1359 } 1360 1361 tail_bytes = (offset + bytes) & (align - 1); 1362 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1363 1364 bytes = ROUND_UP(bytes, align); 1365 } 1366 1367 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1368 use_local_qiov ? &local_qiov : qiov, 1369 flags); 1370 1371 fail: 1372 1373 if (use_local_qiov) { 1374 qemu_iovec_destroy(&local_qiov); 1375 } 1376 qemu_vfree(head_buf); 1377 qemu_vfree(tail_buf); 1378 out: 1379 tracked_request_end(&req); 1380 return ret; 1381 } 1382 1383 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1384 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1385 BdrvRequestFlags flags) 1386 { 1387 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1388 return -EINVAL; 1389 } 1390 1391 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1392 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1393 } 1394 1395 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1396 int nb_sectors, QEMUIOVector *qiov) 1397 { 1398 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1399 1400 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1401 } 1402 1403 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1404 int64_t sector_num, int nb_sectors, 1405 BdrvRequestFlags flags) 1406 { 1407 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1408 1409 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1410 flags &= ~BDRV_REQ_MAY_UNMAP; 1411 } 1412 1413 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1414 BDRV_REQ_ZERO_WRITE | flags); 1415 } 1416 1417 int bdrv_flush_all(void) 1418 { 1419 BlockDriverState *bs = NULL; 1420 int result = 0; 1421 1422 while ((bs = bdrv_next(bs))) { 1423 AioContext *aio_context = bdrv_get_aio_context(bs); 1424 int ret; 1425 1426 aio_context_acquire(aio_context); 1427 ret = bdrv_flush(bs); 1428 if (ret < 0 && !result) { 1429 result = ret; 1430 } 1431 aio_context_release(aio_context); 1432 } 1433 1434 return result; 1435 } 1436 1437 typedef struct BdrvCoGetBlockStatusData { 1438 BlockDriverState *bs; 1439 BlockDriverState *base; 1440 int64_t sector_num; 1441 int nb_sectors; 1442 int *pnum; 1443 int64_t ret; 1444 bool done; 1445 } BdrvCoGetBlockStatusData; 1446 1447 /* 1448 * Returns the allocation status of the specified sectors. 1449 * Drivers not implementing the functionality are assumed to not support 1450 * backing files, hence all their sectors are reported as allocated. 1451 * 1452 * If 'sector_num' is beyond the end of the disk image the return value is 0 1453 * and 'pnum' is set to 0. 1454 * 1455 * 'pnum' is set to the number of sectors (including and immediately following 1456 * the specified sector) that are known to be in the same 1457 * allocated/unallocated state. 1458 * 1459 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1460 * beyond the end of the disk image it will be clamped. 
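 *
 * A typical caller inspects the returned flags, as bdrv_make_zero() above
 * does (sketch):
 *
 *     int n;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
 *     if (ret >= 0 && (ret & BDRV_BLOCK_ZERO)) {
 *         ... the first n sectors already read as zeroes ...
 *     }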
1461 */ 1462 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1463 int64_t sector_num, 1464 int nb_sectors, int *pnum) 1465 { 1466 int64_t total_sectors; 1467 int64_t n; 1468 int64_t ret, ret2; 1469 1470 total_sectors = bdrv_nb_sectors(bs); 1471 if (total_sectors < 0) { 1472 return total_sectors; 1473 } 1474 1475 if (sector_num >= total_sectors) { 1476 *pnum = 0; 1477 return 0; 1478 } 1479 1480 n = total_sectors - sector_num; 1481 if (n < nb_sectors) { 1482 nb_sectors = n; 1483 } 1484 1485 if (!bs->drv->bdrv_co_get_block_status) { 1486 *pnum = nb_sectors; 1487 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1488 if (bs->drv->protocol_name) { 1489 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1490 } 1491 return ret; 1492 } 1493 1494 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 1495 if (ret < 0) { 1496 *pnum = 0; 1497 return ret; 1498 } 1499 1500 if (ret & BDRV_BLOCK_RAW) { 1501 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1502 return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1503 *pnum, pnum); 1504 } 1505 1506 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1507 ret |= BDRV_BLOCK_ALLOCATED; 1508 } else { 1509 if (bdrv_unallocated_blocks_are_zero(bs)) { 1510 ret |= BDRV_BLOCK_ZERO; 1511 } else if (bs->backing) { 1512 BlockDriverState *bs2 = bs->backing->bs; 1513 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1514 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1515 ret |= BDRV_BLOCK_ZERO; 1516 } 1517 } 1518 } 1519 1520 if (bs->file && 1521 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1522 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1523 int file_pnum; 1524 1525 ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, 1526 *pnum, &file_pnum); 1527 if (ret2 >= 0) { 1528 /* Ignore errors. This is just providing extra information, it 1529 * is useful but not necessary. 1530 */ 1531 if (!file_pnum) { 1532 /* !file_pnum indicates an offset at or beyond the EOF; it is 1533 * perfectly valid for the format block driver to point to such 1534 * offsets, so catch it and mark everything as zero */ 1535 ret |= BDRV_BLOCK_ZERO; 1536 } else { 1537 /* Limit request to the range reported by the protocol driver */ 1538 *pnum = file_pnum; 1539 ret |= (ret2 & BDRV_BLOCK_ZERO); 1540 } 1541 } 1542 } 1543 1544 return ret; 1545 } 1546 1547 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1548 BlockDriverState *base, 1549 int64_t sector_num, 1550 int nb_sectors, 1551 int *pnum) 1552 { 1553 BlockDriverState *p; 1554 int64_t ret = 0; 1555 1556 assert(bs != base); 1557 for (p = bs; p != base; p = backing_bs(p)) { 1558 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); 1559 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1560 break; 1561 } 1562 /* [sector_num, pnum] unallocated on this layer, which could be only 1563 * the first part of [sector_num, nb_sectors]. */ 1564 nb_sectors = MIN(nb_sectors, *pnum); 1565 } 1566 return ret; 1567 } 1568 1569 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1570 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1571 { 1572 BdrvCoGetBlockStatusData *data = opaque; 1573 1574 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1575 data->sector_num, 1576 data->nb_sectors, 1577 data->pnum); 1578 data->done = true; 1579 } 1580 1581 /* 1582 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1583 * 1584 * See bdrv_co_get_block_status_above() for details. 
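 *
 * If called from coroutine context the status is obtained directly; otherwise
 * a coroutine is created and the BlockDriverState's AioContext is polled until
 * it has finished.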
1585 */ 1586 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1587 BlockDriverState *base, 1588 int64_t sector_num, 1589 int nb_sectors, int *pnum) 1590 { 1591 Coroutine *co; 1592 BdrvCoGetBlockStatusData data = { 1593 .bs = bs, 1594 .base = base, 1595 .sector_num = sector_num, 1596 .nb_sectors = nb_sectors, 1597 .pnum = pnum, 1598 .done = false, 1599 }; 1600 1601 if (qemu_in_coroutine()) { 1602 /* Fast-path if already in coroutine context */ 1603 bdrv_get_block_status_above_co_entry(&data); 1604 } else { 1605 AioContext *aio_context = bdrv_get_aio_context(bs); 1606 1607 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1608 qemu_coroutine_enter(co, &data); 1609 while (!data.done) { 1610 aio_poll(aio_context, true); 1611 } 1612 } 1613 return data.ret; 1614 } 1615 1616 int64_t bdrv_get_block_status(BlockDriverState *bs, 1617 int64_t sector_num, 1618 int nb_sectors, int *pnum) 1619 { 1620 return bdrv_get_block_status_above(bs, backing_bs(bs), 1621 sector_num, nb_sectors, pnum); 1622 } 1623 1624 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1625 int nb_sectors, int *pnum) 1626 { 1627 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 1628 if (ret < 0) { 1629 return ret; 1630 } 1631 return !!(ret & BDRV_BLOCK_ALLOCATED); 1632 } 1633 1634 /* 1635 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1636 * 1637 * Return true if the given sector is allocated in any image between 1638 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1639 * sector is allocated in any image of the chain. Return false otherwise. 1640 * 1641 * 'pnum' is set to the number of sectors (including and immediately following 1642 * the specified sector) that are known to be in the same 1643 * allocated/unallocated state. 1644 * 1645 */ 1646 int bdrv_is_allocated_above(BlockDriverState *top, 1647 BlockDriverState *base, 1648 int64_t sector_num, 1649 int nb_sectors, int *pnum) 1650 { 1651 BlockDriverState *intermediate; 1652 int ret, n = nb_sectors; 1653 1654 intermediate = top; 1655 while (intermediate && intermediate != base) { 1656 int pnum_inter; 1657 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1658 &pnum_inter); 1659 if (ret < 0) { 1660 return ret; 1661 } else if (ret) { 1662 *pnum = pnum_inter; 1663 return 1; 1664 } 1665 1666 /* 1667 * [sector_num, nb_sectors] is unallocated on top but intermediate 1668 * might have 1669 * 1670 * [sector_num+x, nr_sectors] allocated. 
1671 */ 1672 if (n > pnum_inter && 1673 (intermediate == top || 1674 sector_num + pnum_inter < intermediate->total_sectors)) { 1675 n = pnum_inter; 1676 } 1677 1678 intermediate = backing_bs(intermediate); 1679 } 1680 1681 *pnum = n; 1682 return 0; 1683 } 1684 1685 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1686 const uint8_t *buf, int nb_sectors) 1687 { 1688 BlockDriver *drv = bs->drv; 1689 int ret; 1690 1691 if (!drv) { 1692 return -ENOMEDIUM; 1693 } 1694 if (!drv->bdrv_write_compressed) { 1695 return -ENOTSUP; 1696 } 1697 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1698 if (ret < 0) { 1699 return ret; 1700 } 1701 1702 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1703 1704 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1705 } 1706 1707 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1708 int64_t pos, int size) 1709 { 1710 QEMUIOVector qiov; 1711 struct iovec iov = { 1712 .iov_base = (void *) buf, 1713 .iov_len = size, 1714 }; 1715 1716 qemu_iovec_init_external(&qiov, &iov, 1); 1717 return bdrv_writev_vmstate(bs, &qiov, pos); 1718 } 1719 1720 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1721 { 1722 BlockDriver *drv = bs->drv; 1723 1724 if (!drv) { 1725 return -ENOMEDIUM; 1726 } else if (drv->bdrv_save_vmstate) { 1727 return drv->bdrv_save_vmstate(bs, qiov, pos); 1728 } else if (bs->file) { 1729 return bdrv_writev_vmstate(bs->file->bs, qiov, pos); 1730 } 1731 1732 return -ENOTSUP; 1733 } 1734 1735 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1736 int64_t pos, int size) 1737 { 1738 BlockDriver *drv = bs->drv; 1739 if (!drv) 1740 return -ENOMEDIUM; 1741 if (drv->bdrv_load_vmstate) 1742 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1743 if (bs->file) 1744 return bdrv_load_vmstate(bs->file->bs, buf, pos, size); 1745 return -ENOTSUP; 1746 } 1747 1748 /**************************************************************/ 1749 /* async I/Os */ 1750 1751 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1752 QEMUIOVector *qiov, int nb_sectors, 1753 BlockCompletionFunc *cb, void *opaque) 1754 { 1755 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1756 1757 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1758 cb, opaque, false); 1759 } 1760 1761 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1762 QEMUIOVector *qiov, int nb_sectors, 1763 BlockCompletionFunc *cb, void *opaque) 1764 { 1765 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1766 1767 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1768 cb, opaque, true); 1769 } 1770 1771 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1772 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1773 BlockCompletionFunc *cb, void *opaque) 1774 { 1775 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1776 1777 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1778 BDRV_REQ_ZERO_WRITE | flags, 1779 cb, opaque, true); 1780 } 1781 1782 1783 typedef struct MultiwriteCB { 1784 int error; 1785 int num_requests; 1786 int num_callbacks; 1787 struct { 1788 BlockCompletionFunc *cb; 1789 void *opaque; 1790 QEMUIOVector *free_qiov; 1791 } callbacks[]; 1792 } MultiwriteCB; 1793 1794 static void multiwrite_user_cb(MultiwriteCB *mcb) 1795 { 1796 int i; 1797 1798 for (i = 0; i < mcb->num_callbacks; i++) { 1799 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1800 if (mcb->callbacks[i].free_qiov) { 1801 
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    if (bs->blk) {
        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
                              num_reqs - outidx - 1);
    }

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted.
 * In the error case this function returns -1, and any of the requests may or
 * may not have been submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called.
/* Async version of aio cancel. The caller is never blocked: if the acb
 * implements cancel_async, it is invoked; otherwise we do nothing and let the
 * request complete normally. In either case the completion callback must
 * eventually be called.
 */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}

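/*
 * Note on need_bh: the emulated AIO functions below must not deliver the
 * completion callback before they have returned their BlockAIOCB to the
 * caller.  Each acb therefore starts out with need_bh = true, which makes
 * bdrv_co_complete() a no-op while qemu_coroutine_enter() is still on the
 * caller's stack.  Just before the AIO function returns,
 * bdrv_co_maybe_schedule_bh() clears need_bh and, if the coroutine has
 * already finished (req.error != -EINPROGRESS), defers the callback to a
 * bottom half; otherwise the callback runs directly when the coroutine
 * completes later.
 */
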
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_malloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_free(acb);
    }
}

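/*
 * Illustrative sketch (not built): how a hypothetical driver would use
 * qemu_aio_get()/qemu_aio_unref() above to manage the lifetime of its AIOCB.
 * "ExampleAIOCB", "example_aiocb_info" and the functions below are made-up
 * names, not part of this file.
 */
#if 0
typedef struct ExampleAIOCB {
    BlockAIOCB common;          /* must be first; qemu_aio_get() returns this */
    QEMUBH *bh;
    int ret;
} ExampleAIOCB;

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(ExampleAIOCB),
};

static void example_complete_bh(void *opaque)
{
    ExampleAIOCB *acb = opaque;

    qemu_bh_delete(acb->bh);
    acb->common.cb(acb->common.opaque, acb->ret);   /* deliver completion */
    qemu_aio_unref(acb);                            /* drop the initial reference */
}

static BlockAIOCB *example_aio_flush(BlockDriverState *bs,
                                     BlockCompletionFunc *cb, void *opaque)
{
    ExampleAIOCB *acb = qemu_aio_get(&example_aiocb_info, bs, cb, opaque);

    acb->ret = 0;
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), example_complete_bh, acb);
    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
#endif
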
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

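/*
 * Illustrative sketch (not built): the driver callbacks consumed by
 * bdrv_co_flush() below.  A format driver that buffers data in its own
 * process implements bdrv_co_flush_to_os(); a protocol driver that talks to
 * a file or device implements bdrv_co_flush_to_disk().  The "example_*"
 * names are made up.
 */
#if 0
static int coroutine_fn example_co_flush_to_disk(BlockDriverState *bs)
{
    /* e.g. fdatasync() the underlying file descriptor */
    return 0;
}

static BlockDriver bdrv_example = {
    .format_name           = "example",
    .bdrv_co_flush_to_disk = example_co_flush_to_disk,
};
#endif
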
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bs->file ? bdrv_co_flush(bs->file->bs) : 0;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass the clamped chunk size, not the full remaining request,
             * so that the alignment and max_discard limits are respected. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

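/*
 * Worked example for the splitting loop in bdrv_co_discard() above
 * (illustrative numbers only): with bs->bl.discard_alignment == 8 and a
 * request for 20 sectors starting at sector 5, the first iteration clamps
 * num to the alignment (8) and subtracts the misalignment (5 % 8 == 5),
 * issuing a 3-sector discard for [5, 8).  The second iteration starts
 * aligned at sector 8 and, assuming max_discard is large enough, discards
 * the remaining 17 sectors in one call.
 */
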
/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file->bs);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file->bs);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file->bs);
    }
    bdrv_start_throttled_reqs(bs);
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    if (!bs->quiesce_counter++) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }
    bdrv_drain(bs);
}

void bdrv_drained_end(BlockDriverState *bs)
{
    assert(bs->quiesce_counter > 0);
    if (--bs->quiesce_counter > 0) {
        return;
    }
    aio_enable_external(bdrv_get_aio_context(bs));
}
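
/*
 * Illustrative sketch (not built): the intended pairing of
 * bdrv_drained_begin()/bdrv_drained_end() above.  Code between the two calls
 * runs with external event sources of the BDS's AioContext disabled and with
 * no requests in flight.  "example_quiesced_op" is a made-up name.
 */
#if 0
static void example_quiesced_op(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);     /* drain and block new external I/O */

    /* ... modify bs while no requests are in flight ... */

    bdrv_drained_end(bs);       /* resume external event handling */
}
#endif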