1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "trace.h" 26 #include "sysemu/qtest.h" 27 #include "block/blockjob.h" 28 #include "block/block_int.h" 29 30 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 31 32 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 33 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 34 BlockCompletionFunc *cb, void *opaque); 35 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 36 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 37 BlockCompletionFunc *cb, void *opaque); 38 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 39 int64_t sector_num, int nb_sectors, 40 QEMUIOVector *iov); 41 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 42 int64_t sector_num, int nb_sectors, 43 QEMUIOVector *iov); 44 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 45 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 46 BdrvRequestFlags flags); 47 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 48 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 49 BdrvRequestFlags flags); 50 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 51 int64_t sector_num, 52 QEMUIOVector *qiov, 53 int nb_sectors, 54 BdrvRequestFlags flags, 55 BlockCompletionFunc *cb, 56 void *opaque, 57 bool is_write); 58 static void coroutine_fn bdrv_co_do_rw(void *opaque); 59 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 60 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); 61 62 /* throttling disk I/O limits */ 63 void bdrv_set_io_limits(BlockDriverState *bs, 64 ThrottleConfig *cfg) 65 { 66 int i; 67 68 throttle_config(&bs->throttle_state, cfg); 69 70 for (i = 0; i < 2; i++) { 71 qemu_co_enter_next(&bs->throttled_reqs[i]); 72 } 73 } 74 75 /* this function drain all the throttled IOs */ 76 static bool bdrv_start_throttled_reqs(BlockDriverState *bs) 77 { 78 bool drained = false; 79 bool enabled = bs->io_limits_enabled; 80 int i; 81 82 bs->io_limits_enabled = false; 83 84 for (i = 0; i < 2; i++) { 85 while (qemu_co_enter_next(&bs->throttled_reqs[i])) { 86 drained = true; 87 } 88 } 89 90 bs->io_limits_enabled = enabled; 91 92 return drained; 93 } 94 95 void bdrv_io_limits_disable(BlockDriverState *bs) 96 { 97 bs->io_limits_enabled = false; 98 99 bdrv_start_throttled_reqs(bs); 100 101 
throttle_destroy(&bs->throttle_state); 102 } 103 104 static void bdrv_throttle_read_timer_cb(void *opaque) 105 { 106 BlockDriverState *bs = opaque; 107 qemu_co_enter_next(&bs->throttled_reqs[0]); 108 } 109 110 static void bdrv_throttle_write_timer_cb(void *opaque) 111 { 112 BlockDriverState *bs = opaque; 113 qemu_co_enter_next(&bs->throttled_reqs[1]); 114 } 115 116 /* should be called before bdrv_set_io_limits if a limit is set */ 117 void bdrv_io_limits_enable(BlockDriverState *bs) 118 { 119 int clock_type = QEMU_CLOCK_REALTIME; 120 121 if (qtest_enabled()) { 122 /* For testing block IO throttling only */ 123 clock_type = QEMU_CLOCK_VIRTUAL; 124 } 125 assert(!bs->io_limits_enabled); 126 throttle_init(&bs->throttle_state, 127 bdrv_get_aio_context(bs), 128 clock_type, 129 bdrv_throttle_read_timer_cb, 130 bdrv_throttle_write_timer_cb, 131 bs); 132 bs->io_limits_enabled = true; 133 } 134 135 /* This function makes an IO wait if needed 136 * 137 * @nb_sectors: the number of sectors of the IO 138 * @is_write: is the IO a write 139 */ 140 static void bdrv_io_limits_intercept(BlockDriverState *bs, 141 unsigned int bytes, 142 bool is_write) 143 { 144 /* does this io must wait */ 145 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); 146 147 /* if must wait or any request of this type throttled queue the IO */ 148 if (must_wait || 149 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { 150 qemu_co_queue_wait(&bs->throttled_reqs[is_write]); 151 } 152 153 /* the IO will be executed, do the accounting */ 154 throttle_account(&bs->throttle_state, is_write, bytes); 155 156 157 /* if the next request must wait -> do nothing */ 158 if (throttle_schedule_timer(&bs->throttle_state, is_write)) { 159 return; 160 } 161 162 /* else queue next request for execution */ 163 qemu_co_queue_next(&bs->throttled_reqs[is_write]); 164 } 165 166 void bdrv_setup_io_funcs(BlockDriver *bdrv) 167 { 168 /* Block drivers without coroutine functions need emulation */ 169 if (!bdrv->bdrv_co_readv) { 170 bdrv->bdrv_co_readv = bdrv_co_readv_em; 171 bdrv->bdrv_co_writev = bdrv_co_writev_em; 172 173 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if 174 * the block driver lacks aio we need to emulate that too. 
175 */ 176 if (!bdrv->bdrv_aio_readv) { 177 /* add AIO emulation layer */ 178 bdrv->bdrv_aio_readv = bdrv_aio_readv_em; 179 bdrv->bdrv_aio_writev = bdrv_aio_writev_em; 180 } 181 } 182 } 183 184 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) 185 { 186 BlockDriver *drv = bs->drv; 187 Error *local_err = NULL; 188 189 memset(&bs->bl, 0, sizeof(bs->bl)); 190 191 if (!drv) { 192 return; 193 } 194 195 /* Take some limits from the children as a default */ 196 if (bs->file) { 197 bdrv_refresh_limits(bs->file, &local_err); 198 if (local_err) { 199 error_propagate(errp, local_err); 200 return; 201 } 202 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; 203 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; 204 bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment; 205 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; 206 } else { 207 bs->bl.min_mem_alignment = 512; 208 bs->bl.opt_mem_alignment = getpagesize(); 209 } 210 211 if (bs->backing_hd) { 212 bdrv_refresh_limits(bs->backing_hd, &local_err); 213 if (local_err) { 214 error_propagate(errp, local_err); 215 return; 216 } 217 bs->bl.opt_transfer_length = 218 MAX(bs->bl.opt_transfer_length, 219 bs->backing_hd->bl.opt_transfer_length); 220 bs->bl.max_transfer_length = 221 MIN_NON_ZERO(bs->bl.max_transfer_length, 222 bs->backing_hd->bl.max_transfer_length); 223 bs->bl.opt_mem_alignment = 224 MAX(bs->bl.opt_mem_alignment, 225 bs->backing_hd->bl.opt_mem_alignment); 226 bs->bl.min_mem_alignment = 227 MAX(bs->bl.min_mem_alignment, 228 bs->backing_hd->bl.min_mem_alignment); 229 } 230 231 /* Then let the driver override it */ 232 if (drv->bdrv_refresh_limits) { 233 drv->bdrv_refresh_limits(bs, errp); 234 } 235 } 236 237 /** 238 * The copy-on-read flag is actually a reference count so multiple users may 239 * use the feature without worrying about clobbering its previous state. 240 * Copy-on-read stays enabled until all users have called to disable it. 241 */ 242 void bdrv_enable_copy_on_read(BlockDriverState *bs) 243 { 244 bs->copy_on_read++; 245 } 246 247 void bdrv_disable_copy_on_read(BlockDriverState *bs) 248 { 249 assert(bs->copy_on_read > 0); 250 bs->copy_on_read--; 251 } 252 253 /* Check if any requests are in-flight (including throttled requests) */ 254 static bool bdrv_requests_pending(BlockDriverState *bs) 255 { 256 if (!QLIST_EMPTY(&bs->tracked_requests)) { 257 return true; 258 } 259 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 260 return true; 261 } 262 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 263 return true; 264 } 265 if (bs->file && bdrv_requests_pending(bs->file)) { 266 return true; 267 } 268 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { 269 return true; 270 } 271 return false; 272 } 273 274 static bool bdrv_drain_one(BlockDriverState *bs) 275 { 276 bool bs_busy; 277 278 bdrv_flush_io_queue(bs); 279 bdrv_start_throttled_reqs(bs); 280 bs_busy = bdrv_requests_pending(bs); 281 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); 282 return bs_busy; 283 } 284 285 /* 286 * Wait for pending requests to complete on a single BlockDriverState subtree 287 * 288 * See the warning in bdrv_drain_all(). This function can only be called if 289 * you are sure nothing can generate I/O because you have op blockers 290 * installed. 291 * 292 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 293 * AioContext. 
294 */ 295 void bdrv_drain(BlockDriverState *bs) 296 { 297 while (bdrv_drain_one(bs)) { 298 /* Keep iterating */ 299 } 300 } 301 302 /* 303 * Wait for pending requests to complete across all BlockDriverStates 304 * 305 * This function does not flush data to disk, use bdrv_flush_all() for that 306 * after calling this function. 307 * 308 * Note that completion of an asynchronous I/O operation can trigger any 309 * number of other I/O operations on other devices---for example a coroutine 310 * can be arbitrarily complex and a constant flow of I/O can come until the 311 * coroutine is complete. Because of this, it is not possible to have a 312 * function to drain a single device's I/O queue. 313 */ 314 void bdrv_drain_all(void) 315 { 316 /* Always run first iteration so any pending completion BHs run */ 317 bool busy = true; 318 BlockDriverState *bs = NULL; 319 320 while ((bs = bdrv_next(bs))) { 321 AioContext *aio_context = bdrv_get_aio_context(bs); 322 323 aio_context_acquire(aio_context); 324 if (bs->job) { 325 block_job_pause(bs->job); 326 } 327 aio_context_release(aio_context); 328 } 329 330 while (busy) { 331 busy = false; 332 bs = NULL; 333 334 while ((bs = bdrv_next(bs))) { 335 AioContext *aio_context = bdrv_get_aio_context(bs); 336 337 aio_context_acquire(aio_context); 338 busy |= bdrv_drain_one(bs); 339 aio_context_release(aio_context); 340 } 341 } 342 343 bs = NULL; 344 while ((bs = bdrv_next(bs))) { 345 AioContext *aio_context = bdrv_get_aio_context(bs); 346 347 aio_context_acquire(aio_context); 348 if (bs->job) { 349 block_job_resume(bs->job); 350 } 351 aio_context_release(aio_context); 352 } 353 } 354 355 /** 356 * Remove an active request from the tracked requests list 357 * 358 * This function should be called when a tracked request is completing. 
359 */ 360 static void tracked_request_end(BdrvTrackedRequest *req) 361 { 362 if (req->serialising) { 363 req->bs->serialising_in_flight--; 364 } 365 366 QLIST_REMOVE(req, list); 367 qemu_co_queue_restart_all(&req->wait_queue); 368 } 369 370 /** 371 * Add an active request to the tracked requests list 372 */ 373 static void tracked_request_begin(BdrvTrackedRequest *req, 374 BlockDriverState *bs, 375 int64_t offset, 376 unsigned int bytes, bool is_write) 377 { 378 *req = (BdrvTrackedRequest){ 379 .bs = bs, 380 .offset = offset, 381 .bytes = bytes, 382 .is_write = is_write, 383 .co = qemu_coroutine_self(), 384 .serialising = false, 385 .overlap_offset = offset, 386 .overlap_bytes = bytes, 387 }; 388 389 qemu_co_queue_init(&req->wait_queue); 390 391 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 392 } 393 394 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 395 { 396 int64_t overlap_offset = req->offset & ~(align - 1); 397 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 398 - overlap_offset; 399 400 if (!req->serialising) { 401 req->bs->serialising_in_flight++; 402 req->serialising = true; 403 } 404 405 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 406 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 407 } 408 409 /** 410 * Round a region to cluster boundaries 411 */ 412 void bdrv_round_to_clusters(BlockDriverState *bs, 413 int64_t sector_num, int nb_sectors, 414 int64_t *cluster_sector_num, 415 int *cluster_nb_sectors) 416 { 417 BlockDriverInfo bdi; 418 419 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 420 *cluster_sector_num = sector_num; 421 *cluster_nb_sectors = nb_sectors; 422 } else { 423 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 424 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 425 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 426 nb_sectors, c); 427 } 428 } 429 430 static int bdrv_get_cluster_size(BlockDriverState *bs) 431 { 432 BlockDriverInfo bdi; 433 int ret; 434 435 ret = bdrv_get_info(bs, &bdi); 436 if (ret < 0 || bdi.cluster_size == 0) { 437 return bs->request_alignment; 438 } else { 439 return bdi.cluster_size; 440 } 441 } 442 443 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 444 int64_t offset, unsigned int bytes) 445 { 446 /* aaaa bbbb */ 447 if (offset >= req->overlap_offset + req->overlap_bytes) { 448 return false; 449 } 450 /* bbbb aaaa */ 451 if (req->overlap_offset >= offset + bytes) { 452 return false; 453 } 454 return true; 455 } 456 457 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 458 { 459 BlockDriverState *bs = self->bs; 460 BdrvTrackedRequest *req; 461 bool retry; 462 bool waited = false; 463 464 if (!bs->serialising_in_flight) { 465 return false; 466 } 467 468 do { 469 retry = false; 470 QLIST_FOREACH(req, &bs->tracked_requests, list) { 471 if (req == self || (!req->serialising && !self->serialising)) { 472 continue; 473 } 474 if (tracked_request_overlaps(req, self->overlap_offset, 475 self->overlap_bytes)) 476 { 477 /* Hitting this means there was a reentrant request, for 478 * example, a block driver issuing nested requests. This must 479 * never happen since it means deadlock. 480 */ 481 assert(qemu_coroutine_self() != req->co); 482 483 /* If the request is already (indirectly) waiting for us, or 484 * will wait for us as soon as it wakes up, then just go on 485 * (instead of producing a deadlock in the former case). 
*/ 486 if (!req->waiting_for) { 487 self->waiting_for = req; 488 qemu_co_queue_wait(&req->wait_queue); 489 self->waiting_for = NULL; 490 retry = true; 491 waited = true; 492 break; 493 } 494 } 495 } 496 } while (retry); 497 498 return waited; 499 } 500 501 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 502 size_t size) 503 { 504 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 505 return -EIO; 506 } 507 508 if (!bdrv_is_inserted(bs)) { 509 return -ENOMEDIUM; 510 } 511 512 if (offset < 0) { 513 return -EIO; 514 } 515 516 return 0; 517 } 518 519 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 520 int nb_sectors) 521 { 522 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 523 return -EIO; 524 } 525 526 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 527 nb_sectors * BDRV_SECTOR_SIZE); 528 } 529 530 typedef struct RwCo { 531 BlockDriverState *bs; 532 int64_t offset; 533 QEMUIOVector *qiov; 534 bool is_write; 535 int ret; 536 BdrvRequestFlags flags; 537 } RwCo; 538 539 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 540 { 541 RwCo *rwco = opaque; 542 543 if (!rwco->is_write) { 544 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 545 rwco->qiov->size, rwco->qiov, 546 rwco->flags); 547 } else { 548 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 549 rwco->qiov->size, rwco->qiov, 550 rwco->flags); 551 } 552 } 553 554 /* 555 * Process a vectored synchronous request using coroutines 556 */ 557 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 558 QEMUIOVector *qiov, bool is_write, 559 BdrvRequestFlags flags) 560 { 561 Coroutine *co; 562 RwCo rwco = { 563 .bs = bs, 564 .offset = offset, 565 .qiov = qiov, 566 .is_write = is_write, 567 .ret = NOT_DONE, 568 .flags = flags, 569 }; 570 571 /** 572 * In sync call context, when the vcpu is blocked, this throttling timer 573 * will not fire; so the I/O throttling function has to be disabled here 574 * if it has been enabled. 575 */ 576 if (bs->io_limits_enabled) { 577 fprintf(stderr, "Disabling I/O throttling on '%s' due " 578 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 579 bdrv_io_limits_disable(bs); 580 } 581 582 if (qemu_in_coroutine()) { 583 /* Fast-path if already in coroutine context */ 584 bdrv_rw_co_entry(&rwco); 585 } else { 586 AioContext *aio_context = bdrv_get_aio_context(bs); 587 588 co = qemu_coroutine_create(bdrv_rw_co_entry); 589 qemu_coroutine_enter(co, &rwco); 590 while (rwco.ret == NOT_DONE) { 591 aio_poll(aio_context, true); 592 } 593 } 594 return rwco.ret; 595 } 596 597 /* 598 * Process a synchronous request using coroutines 599 */ 600 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 601 int nb_sectors, bool is_write, BdrvRequestFlags flags) 602 { 603 QEMUIOVector qiov; 604 struct iovec iov = { 605 .iov_base = (void *)buf, 606 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 607 }; 608 609 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 610 return -EINVAL; 611 } 612 613 qemu_iovec_init_external(&qiov, &iov, 1); 614 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 615 &qiov, is_write, flags); 616 } 617 618 /* return < 0 if error. 
See bdrv_write() for the return codes */ 619 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 620 uint8_t *buf, int nb_sectors) 621 { 622 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 623 } 624 625 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 626 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 627 uint8_t *buf, int nb_sectors) 628 { 629 bool enabled; 630 int ret; 631 632 enabled = bs->io_limits_enabled; 633 bs->io_limits_enabled = false; 634 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 635 bs->io_limits_enabled = enabled; 636 return ret; 637 } 638 639 /* Return < 0 if error. Important errors are: 640 -EIO generic I/O error (may happen for all errors) 641 -ENOMEDIUM No media inserted. 642 -EINVAL Invalid sector number or nb_sectors 643 -EACCES Trying to write a read-only device 644 */ 645 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 646 const uint8_t *buf, int nb_sectors) 647 { 648 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 649 } 650 651 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 652 int nb_sectors, BdrvRequestFlags flags) 653 { 654 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 655 BDRV_REQ_ZERO_WRITE | flags); 656 } 657 658 /* 659 * Completely zero out a block device with the help of bdrv_write_zeroes. 660 * The operation is sped up by checking the block status and only writing 661 * zeroes to the device if they currently do not return zeroes. Optional 662 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 663 * 664 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 665 */ 666 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 667 { 668 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 669 int n; 670 671 target_sectors = bdrv_nb_sectors(bs); 672 if (target_sectors < 0) { 673 return target_sectors; 674 } 675 676 for (;;) { 677 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 678 if (nb_sectors <= 0) { 679 return 0; 680 } 681 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 682 if (ret < 0) { 683 error_report("error getting block status at sector %" PRId64 ": %s", 684 sector_num, strerror(-ret)); 685 return ret; 686 } 687 if (ret & BDRV_BLOCK_ZERO) { 688 sector_num += n; 689 continue; 690 } 691 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 692 if (ret < 0) { 693 error_report("error writing zeroes at sector %" PRId64 ": %s", 694 sector_num, strerror(-ret)); 695 return ret; 696 } 697 sector_num += n; 698 } 699 } 700 701 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 702 { 703 QEMUIOVector qiov; 704 struct iovec iov = { 705 .iov_base = (void *)buf, 706 .iov_len = bytes, 707 }; 708 int ret; 709 710 if (bytes < 0) { 711 return -EINVAL; 712 } 713 714 qemu_iovec_init_external(&qiov, &iov, 1); 715 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 716 if (ret < 0) { 717 return ret; 718 } 719 720 return bytes; 721 } 722 723 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 724 { 725 int ret; 726 727 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 728 if (ret < 0) { 729 return ret; 730 } 731 732 return qiov->size; 733 } 734 735 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 736 const void *buf, int bytes) 737 { 738 QEMUIOVector qiov; 739 struct iovec iov = { 740 .iov_base = (void *) buf, 741 .iov_len = bytes, 742 }; 743 744 if (bytes < 0) { 745 return -EINVAL; 746 } 747 748 
qemu_iovec_init_external(&qiov, &iov, 1); 749 return bdrv_pwritev(bs, offset, &qiov); 750 } 751 752 /* 753 * Writes to the file and ensures that no writes are reordered across this 754 * request (acts as a barrier) 755 * 756 * Returns 0 on success, -errno in error cases. 757 */ 758 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 759 const void *buf, int count) 760 { 761 int ret; 762 763 ret = bdrv_pwrite(bs, offset, buf, count); 764 if (ret < 0) { 765 return ret; 766 } 767 768 /* No flush needed for cache modes that already do it */ 769 if (bs->enable_write_cache) { 770 bdrv_flush(bs); 771 } 772 773 return 0; 774 } 775 776 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 777 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 778 { 779 /* Perform I/O through a temporary buffer so that users who scribble over 780 * their read buffer while the operation is in progress do not end up 781 * modifying the image file. This is critical for zero-copy guest I/O 782 * where anything might happen inside guest memory. 783 */ 784 void *bounce_buffer; 785 786 BlockDriver *drv = bs->drv; 787 struct iovec iov; 788 QEMUIOVector bounce_qiov; 789 int64_t cluster_sector_num; 790 int cluster_nb_sectors; 791 size_t skip_bytes; 792 int ret; 793 794 /* Cover entire cluster so no additional backing file I/O is required when 795 * allocating cluster in the image file. 796 */ 797 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 798 &cluster_sector_num, &cluster_nb_sectors); 799 800 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 801 cluster_sector_num, cluster_nb_sectors); 802 803 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 804 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 805 if (bounce_buffer == NULL) { 806 ret = -ENOMEM; 807 goto err; 808 } 809 810 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 811 812 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 813 &bounce_qiov); 814 if (ret < 0) { 815 goto err; 816 } 817 818 if (drv->bdrv_co_write_zeroes && 819 buffer_is_zero(bounce_buffer, iov.iov_len)) { 820 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 821 cluster_nb_sectors, 0); 822 } else { 823 /* This does not change the data on the disk, it is not necessary 824 * to flush even in cache=writethrough mode. 825 */ 826 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 827 &bounce_qiov); 828 } 829 830 if (ret < 0) { 831 /* It might be okay to ignore write errors for guest requests. If this 832 * is a deliberate copy-on-read then we don't want to ignore the error. 833 * Simply report it in all cases. 834 */ 835 goto err; 836 } 837 838 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 839 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 840 nb_sectors * BDRV_SECTOR_SIZE); 841 842 err: 843 qemu_vfree(bounce_buffer); 844 return ret; 845 } 846 847 /* 848 * Forwards an already correctly aligned request to the BlockDriver. This 849 * handles copy on read and zeroing after EOF; any other features must be 850 * implemented by the caller. 
851 */ 852 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 853 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 854 int64_t align, QEMUIOVector *qiov, int flags) 855 { 856 BlockDriver *drv = bs->drv; 857 int ret; 858 859 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 860 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 861 862 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 863 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 864 assert(!qiov || bytes == qiov->size); 865 866 /* Handle Copy on Read and associated serialisation */ 867 if (flags & BDRV_REQ_COPY_ON_READ) { 868 /* If we touch the same cluster it counts as an overlap. This 869 * guarantees that allocating writes will be serialized and not race 870 * with each other for the same cluster. For example, in copy-on-read 871 * it ensures that the CoR read and write operations are atomic and 872 * guest writes cannot interleave between them. */ 873 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 874 } 875 876 wait_serialising_requests(req); 877 878 if (flags & BDRV_REQ_COPY_ON_READ) { 879 int pnum; 880 881 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 882 if (ret < 0) { 883 goto out; 884 } 885 886 if (!ret || pnum != nb_sectors) { 887 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 888 goto out; 889 } 890 } 891 892 /* Forward the request to the BlockDriver */ 893 if (!bs->zero_beyond_eof) { 894 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 895 } else { 896 /* Read zeros after EOF */ 897 int64_t total_sectors, max_nb_sectors; 898 899 total_sectors = bdrv_nb_sectors(bs); 900 if (total_sectors < 0) { 901 ret = total_sectors; 902 goto out; 903 } 904 905 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 906 align >> BDRV_SECTOR_BITS); 907 if (nb_sectors < max_nb_sectors) { 908 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 909 } else if (max_nb_sectors > 0) { 910 QEMUIOVector local_qiov; 911 912 qemu_iovec_init(&local_qiov, qiov->niov); 913 qemu_iovec_concat(&local_qiov, qiov, 0, 914 max_nb_sectors * BDRV_SECTOR_SIZE); 915 916 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 917 &local_qiov); 918 919 qemu_iovec_destroy(&local_qiov); 920 } else { 921 ret = 0; 922 } 923 924 /* Reading beyond end of file is supposed to produce zeroes */ 925 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 926 uint64_t offset = MAX(0, total_sectors - sector_num); 927 uint64_t bytes = (sector_num + nb_sectors - offset) * 928 BDRV_SECTOR_SIZE; 929 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 930 } 931 } 932 933 out: 934 return ret; 935 } 936 937 /* 938 * Handle a read request in coroutine context 939 */ 940 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 941 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 942 BdrvRequestFlags flags) 943 { 944 BlockDriver *drv = bs->drv; 945 BdrvTrackedRequest req; 946 947 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 948 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 949 uint8_t *head_buf = NULL; 950 uint8_t *tail_buf = NULL; 951 QEMUIOVector local_qiov; 952 bool use_local_qiov = false; 953 int ret; 954 955 if (!drv) { 956 return -ENOMEDIUM; 957 } 958 959 ret = bdrv_check_byte_request(bs, offset, bytes); 960 if (ret < 0) { 961 return ret; 962 } 963 964 if (bs->copy_on_read) { 965 flags |= BDRV_REQ_COPY_ON_READ; 966 } 967 968 /* throttling disk I/O */ 969 if (bs->io_limits_enabled) { 970 bdrv_io_limits_intercept(bs, 
bytes, false); 971 } 972 973 /* Align read if necessary by padding qiov */ 974 if (offset & (align - 1)) { 975 head_buf = qemu_blockalign(bs, align); 976 qemu_iovec_init(&local_qiov, qiov->niov + 2); 977 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 978 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 979 use_local_qiov = true; 980 981 bytes += offset & (align - 1); 982 offset = offset & ~(align - 1); 983 } 984 985 if ((offset + bytes) & (align - 1)) { 986 if (!use_local_qiov) { 987 qemu_iovec_init(&local_qiov, qiov->niov + 1); 988 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 989 use_local_qiov = true; 990 } 991 tail_buf = qemu_blockalign(bs, align); 992 qemu_iovec_add(&local_qiov, tail_buf, 993 align - ((offset + bytes) & (align - 1))); 994 995 bytes = ROUND_UP(bytes, align); 996 } 997 998 tracked_request_begin(&req, bs, offset, bytes, false); 999 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 1000 use_local_qiov ? &local_qiov : qiov, 1001 flags); 1002 tracked_request_end(&req); 1003 1004 if (use_local_qiov) { 1005 qemu_iovec_destroy(&local_qiov); 1006 qemu_vfree(head_buf); 1007 qemu_vfree(tail_buf); 1008 } 1009 1010 return ret; 1011 } 1012 1013 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 1014 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1015 BdrvRequestFlags flags) 1016 { 1017 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1018 return -EINVAL; 1019 } 1020 1021 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 1022 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1023 } 1024 1025 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 1026 int nb_sectors, QEMUIOVector *qiov) 1027 { 1028 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1029 1030 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1031 } 1032 1033 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1034 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1035 { 1036 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1037 1038 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1039 BDRV_REQ_COPY_ON_READ); 1040 } 1041 1042 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1043 1044 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1045 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1046 { 1047 BlockDriver *drv = bs->drv; 1048 QEMUIOVector qiov; 1049 struct iovec iov = {0}; 1050 int ret = 0; 1051 1052 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1053 BDRV_REQUEST_MAX_SECTORS); 1054 1055 while (nb_sectors > 0 && !ret) { 1056 int num = nb_sectors; 1057 1058 /* Align request. Block drivers can expect the "bulk" of the request 1059 * to be aligned. 1060 */ 1061 if (bs->bl.write_zeroes_alignment 1062 && num > bs->bl.write_zeroes_alignment) { 1063 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1064 /* Make a small request up to the first aligned sector. */ 1065 num = bs->bl.write_zeroes_alignment; 1066 num -= sector_num % bs->bl.write_zeroes_alignment; 1067 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1068 /* Shorten the request to the last aligned sector. num cannot 1069 * underflow because num > bs->bl.write_zeroes_alignment. 
1070 */ 1071 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 1072 } 1073 } 1074 1075 /* limit request size */ 1076 if (num > max_write_zeroes) { 1077 num = max_write_zeroes; 1078 } 1079 1080 ret = -ENOTSUP; 1081 /* First try the efficient write zeroes operation */ 1082 if (drv->bdrv_co_write_zeroes) { 1083 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 1084 } 1085 1086 if (ret == -ENOTSUP) { 1087 /* Fall back to bounce buffer if write zeroes is unsupported */ 1088 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, 1089 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1090 num = MIN(num, max_xfer_len); 1091 iov.iov_len = num * BDRV_SECTOR_SIZE; 1092 if (iov.iov_base == NULL) { 1093 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); 1094 if (iov.iov_base == NULL) { 1095 ret = -ENOMEM; 1096 goto fail; 1097 } 1098 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 1099 } 1100 qemu_iovec_init_external(&qiov, &iov, 1); 1101 1102 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 1103 1104 /* Keep bounce buffer around if it is big enough for all 1105 * all future requests. 1106 */ 1107 if (num < max_xfer_len) { 1108 qemu_vfree(iov.iov_base); 1109 iov.iov_base = NULL; 1110 } 1111 } 1112 1113 sector_num += num; 1114 nb_sectors -= num; 1115 } 1116 1117 fail: 1118 qemu_vfree(iov.iov_base); 1119 return ret; 1120 } 1121 1122 /* 1123 * Forwards an already correctly aligned write request to the BlockDriver. 1124 */ 1125 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 1126 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1127 QEMUIOVector *qiov, int flags) 1128 { 1129 BlockDriver *drv = bs->drv; 1130 bool waited; 1131 int ret; 1132 1133 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 1134 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 1135 1136 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1137 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1138 assert(!qiov || bytes == qiov->size); 1139 1140 waited = wait_serialising_requests(req); 1141 assert(!waited || !req->serialising); 1142 assert(req->overlap_offset <= offset); 1143 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1144 1145 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1146 1147 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1148 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 1149 qemu_iovec_is_zero(qiov)) { 1150 flags |= BDRV_REQ_ZERO_WRITE; 1151 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1152 flags |= BDRV_REQ_MAY_UNMAP; 1153 } 1154 } 1155 1156 if (ret < 0) { 1157 /* Do nothing, write notifier decided to fail this request */ 1158 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1159 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); 1160 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 1161 } else { 1162 BLKDBG_EVENT(bs, BLKDBG_PWRITEV); 1163 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 1164 } 1165 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); 1166 1167 if (ret == 0 && !bs->enable_write_cache) { 1168 ret = bdrv_co_flush(bs); 1169 } 1170 1171 bdrv_set_dirty(bs, sector_num, nb_sectors); 1172 1173 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); 1174 1175 if (ret >= 0) { 1176 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1177 } 1178 1179 return ret; 1180 } 1181 1182 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, 1183 int64_t offset, 1184 unsigned int bytes, 1185 BdrvRequestFlags flags, 1186 
BdrvTrackedRequest *req) 1187 { 1188 uint8_t *buf = NULL; 1189 QEMUIOVector local_qiov; 1190 struct iovec iov; 1191 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1192 unsigned int head_padding_bytes, tail_padding_bytes; 1193 int ret = 0; 1194 1195 head_padding_bytes = offset & (align - 1); 1196 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1197 1198 1199 assert(flags & BDRV_REQ_ZERO_WRITE); 1200 if (head_padding_bytes || tail_padding_bytes) { 1201 buf = qemu_blockalign(bs, align); 1202 iov = (struct iovec) { 1203 .iov_base = buf, 1204 .iov_len = align, 1205 }; 1206 qemu_iovec_init_external(&local_qiov, &iov, 1); 1207 } 1208 if (head_padding_bytes) { 1209 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1210 1211 /* RMW the unaligned part before head. */ 1212 mark_request_serialising(req, align); 1213 wait_serialising_requests(req); 1214 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 1215 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1216 align, &local_qiov, 0); 1217 if (ret < 0) { 1218 goto fail; 1219 } 1220 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1221 1222 memset(buf + head_padding_bytes, 0, zero_bytes); 1223 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1224 &local_qiov, 1225 flags & ~BDRV_REQ_ZERO_WRITE); 1226 if (ret < 0) { 1227 goto fail; 1228 } 1229 offset += zero_bytes; 1230 bytes -= zero_bytes; 1231 } 1232 1233 assert(!bytes || (offset & (align - 1)) == 0); 1234 if (bytes >= align) { 1235 /* Write the aligned part in the middle. */ 1236 uint64_t aligned_bytes = bytes & ~(align - 1); 1237 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1238 NULL, flags); 1239 if (ret < 0) { 1240 goto fail; 1241 } 1242 bytes -= aligned_bytes; 1243 offset += aligned_bytes; 1244 } 1245 1246 assert(!bytes || (offset & (align - 1)) == 0); 1247 if (bytes) { 1248 assert(align == tail_padding_bytes + bytes); 1249 /* RMW the unaligned part after tail. */ 1250 mark_request_serialising(req, align); 1251 wait_serialising_requests(req); 1252 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 1253 ret = bdrv_aligned_preadv(bs, req, offset, align, 1254 align, &local_qiov, 0); 1255 if (ret < 0) { 1256 goto fail; 1257 } 1258 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1259 1260 memset(buf, 0, bytes); 1261 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1262 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1263 } 1264 fail: 1265 qemu_vfree(buf); 1266 return ret; 1267 1268 } 1269 1270 /* 1271 * Handle a write request in coroutine context 1272 */ 1273 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1274 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1275 BdrvRequestFlags flags) 1276 { 1277 BdrvTrackedRequest req; 1278 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1279 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1280 uint8_t *head_buf = NULL; 1281 uint8_t *tail_buf = NULL; 1282 QEMUIOVector local_qiov; 1283 bool use_local_qiov = false; 1284 int ret; 1285 1286 if (!bs->drv) { 1287 return -ENOMEDIUM; 1288 } 1289 if (bs->read_only) { 1290 return -EPERM; 1291 } 1292 1293 ret = bdrv_check_byte_request(bs, offset, bytes); 1294 if (ret < 0) { 1295 return ret; 1296 } 1297 1298 /* throttling disk I/O */ 1299 if (bs->io_limits_enabled) { 1300 bdrv_io_limits_intercept(bs, bytes, true); 1301 } 1302 1303 /* 1304 * Align write if necessary by performing a read-modify-write cycle. 
1305 * Pad qiov with the read parts and be sure to have a tracked request not 1306 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1307 */ 1308 tracked_request_begin(&req, bs, offset, bytes, true); 1309 1310 if (!qiov) { 1311 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1312 goto out; 1313 } 1314 1315 if (offset & (align - 1)) { 1316 QEMUIOVector head_qiov; 1317 struct iovec head_iov; 1318 1319 mark_request_serialising(&req, align); 1320 wait_serialising_requests(&req); 1321 1322 head_buf = qemu_blockalign(bs, align); 1323 head_iov = (struct iovec) { 1324 .iov_base = head_buf, 1325 .iov_len = align, 1326 }; 1327 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1328 1329 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 1330 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1331 align, &head_qiov, 0); 1332 if (ret < 0) { 1333 goto fail; 1334 } 1335 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1336 1337 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1338 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1339 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1340 use_local_qiov = true; 1341 1342 bytes += offset & (align - 1); 1343 offset = offset & ~(align - 1); 1344 } 1345 1346 if ((offset + bytes) & (align - 1)) { 1347 QEMUIOVector tail_qiov; 1348 struct iovec tail_iov; 1349 size_t tail_bytes; 1350 bool waited; 1351 1352 mark_request_serialising(&req, align); 1353 waited = wait_serialising_requests(&req); 1354 assert(!waited || !use_local_qiov); 1355 1356 tail_buf = qemu_blockalign(bs, align); 1357 tail_iov = (struct iovec) { 1358 .iov_base = tail_buf, 1359 .iov_len = align, 1360 }; 1361 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1362 1363 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 1364 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1365 align, &tail_qiov, 0); 1366 if (ret < 0) { 1367 goto fail; 1368 } 1369 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1370 1371 if (!use_local_qiov) { 1372 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1373 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1374 use_local_qiov = true; 1375 } 1376 1377 tail_bytes = (offset + bytes) & (align - 1); 1378 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1379 1380 bytes = ROUND_UP(bytes, align); 1381 } 1382 1383 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1384 use_local_qiov ? 
&local_qiov : qiov, 1385 flags); 1386 1387 fail: 1388 1389 if (use_local_qiov) { 1390 qemu_iovec_destroy(&local_qiov); 1391 } 1392 qemu_vfree(head_buf); 1393 qemu_vfree(tail_buf); 1394 out: 1395 tracked_request_end(&req); 1396 return ret; 1397 } 1398 1399 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1400 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1401 BdrvRequestFlags flags) 1402 { 1403 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1404 return -EINVAL; 1405 } 1406 1407 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1408 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1409 } 1410 1411 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1412 int nb_sectors, QEMUIOVector *qiov) 1413 { 1414 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1415 1416 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1417 } 1418 1419 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1420 int64_t sector_num, int nb_sectors, 1421 BdrvRequestFlags flags) 1422 { 1423 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1424 1425 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1426 flags &= ~BDRV_REQ_MAY_UNMAP; 1427 } 1428 1429 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1430 BDRV_REQ_ZERO_WRITE | flags); 1431 } 1432 1433 int bdrv_flush_all(void) 1434 { 1435 BlockDriverState *bs = NULL; 1436 int result = 0; 1437 1438 while ((bs = bdrv_next(bs))) { 1439 AioContext *aio_context = bdrv_get_aio_context(bs); 1440 int ret; 1441 1442 aio_context_acquire(aio_context); 1443 ret = bdrv_flush(bs); 1444 if (ret < 0 && !result) { 1445 result = ret; 1446 } 1447 aio_context_release(aio_context); 1448 } 1449 1450 return result; 1451 } 1452 1453 typedef struct BdrvCoGetBlockStatusData { 1454 BlockDriverState *bs; 1455 BlockDriverState *base; 1456 int64_t sector_num; 1457 int nb_sectors; 1458 int *pnum; 1459 int64_t ret; 1460 bool done; 1461 } BdrvCoGetBlockStatusData; 1462 1463 /* 1464 * Returns the allocation status of the specified sectors. 1465 * Drivers not implementing the functionality are assumed to not support 1466 * backing files, hence all their sectors are reported as allocated. 1467 * 1468 * If 'sector_num' is beyond the end of the disk image the return value is 0 1469 * and 'pnum' is set to 0. 1470 * 1471 * 'pnum' is set to the number of sectors (including and immediately following 1472 * the specified sector) that are known to be in the same 1473 * allocated/unallocated state. 1474 * 1475 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1476 * beyond the end of the disk image it will be clamped. 
1477 */ 1478 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1479 int64_t sector_num, 1480 int nb_sectors, int *pnum) 1481 { 1482 int64_t total_sectors; 1483 int64_t n; 1484 int64_t ret, ret2; 1485 1486 total_sectors = bdrv_nb_sectors(bs); 1487 if (total_sectors < 0) { 1488 return total_sectors; 1489 } 1490 1491 if (sector_num >= total_sectors) { 1492 *pnum = 0; 1493 return 0; 1494 } 1495 1496 n = total_sectors - sector_num; 1497 if (n < nb_sectors) { 1498 nb_sectors = n; 1499 } 1500 1501 if (!bs->drv->bdrv_co_get_block_status) { 1502 *pnum = nb_sectors; 1503 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1504 if (bs->drv->protocol_name) { 1505 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1506 } 1507 return ret; 1508 } 1509 1510 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 1511 if (ret < 0) { 1512 *pnum = 0; 1513 return ret; 1514 } 1515 1516 if (ret & BDRV_BLOCK_RAW) { 1517 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1518 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 1519 *pnum, pnum); 1520 } 1521 1522 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1523 ret |= BDRV_BLOCK_ALLOCATED; 1524 } else { 1525 if (bdrv_unallocated_blocks_are_zero(bs)) { 1526 ret |= BDRV_BLOCK_ZERO; 1527 } else if (bs->backing_hd) { 1528 BlockDriverState *bs2 = bs->backing_hd; 1529 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1530 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1531 ret |= BDRV_BLOCK_ZERO; 1532 } 1533 } 1534 } 1535 1536 if (bs->file && 1537 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1538 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1539 int file_pnum; 1540 1541 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 1542 *pnum, &file_pnum); 1543 if (ret2 >= 0) { 1544 /* Ignore errors. This is just providing extra information, it 1545 * is useful but not necessary. 1546 */ 1547 if (!file_pnum) { 1548 /* !file_pnum indicates an offset at or beyond the EOF; it is 1549 * perfectly valid for the format block driver to point to such 1550 * offsets, so catch it and mark everything as zero */ 1551 ret |= BDRV_BLOCK_ZERO; 1552 } else { 1553 /* Limit request to the range reported by the protocol driver */ 1554 *pnum = file_pnum; 1555 ret |= (ret2 & BDRV_BLOCK_ZERO); 1556 } 1557 } 1558 } 1559 1560 return ret; 1561 } 1562 1563 /* Coroutine wrapper for bdrv_get_block_status() */ 1564 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) 1565 { 1566 BdrvCoGetBlockStatusData *data = opaque; 1567 BlockDriverState *bs = data->bs; 1568 1569 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, 1570 data->pnum); 1571 data->done = true; 1572 } 1573 1574 /* 1575 * Synchronous wrapper around bdrv_co_get_block_status(). 1576 * 1577 * See bdrv_co_get_block_status() for details. 
1578 */ 1579 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, 1580 int nb_sectors, int *pnum) 1581 { 1582 Coroutine *co; 1583 BdrvCoGetBlockStatusData data = { 1584 .bs = bs, 1585 .sector_num = sector_num, 1586 .nb_sectors = nb_sectors, 1587 .pnum = pnum, 1588 .done = false, 1589 }; 1590 1591 if (qemu_in_coroutine()) { 1592 /* Fast-path if already in coroutine context */ 1593 bdrv_get_block_status_co_entry(&data); 1594 } else { 1595 AioContext *aio_context = bdrv_get_aio_context(bs); 1596 1597 co = qemu_coroutine_create(bdrv_get_block_status_co_entry); 1598 qemu_coroutine_enter(co, &data); 1599 while (!data.done) { 1600 aio_poll(aio_context, true); 1601 } 1602 } 1603 return data.ret; 1604 } 1605 1606 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1607 int nb_sectors, int *pnum) 1608 { 1609 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 1610 if (ret < 0) { 1611 return ret; 1612 } 1613 return !!(ret & BDRV_BLOCK_ALLOCATED); 1614 } 1615 1616 /* 1617 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1618 * 1619 * Return true if the given sector is allocated in any image between 1620 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1621 * sector is allocated in any image of the chain. Return false otherwise. 1622 * 1623 * 'pnum' is set to the number of sectors (including and immediately following 1624 * the specified sector) that are known to be in the same 1625 * allocated/unallocated state. 1626 * 1627 */ 1628 int bdrv_is_allocated_above(BlockDriverState *top, 1629 BlockDriverState *base, 1630 int64_t sector_num, 1631 int nb_sectors, int *pnum) 1632 { 1633 BlockDriverState *intermediate; 1634 int ret, n = nb_sectors; 1635 1636 intermediate = top; 1637 while (intermediate && intermediate != base) { 1638 int pnum_inter; 1639 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1640 &pnum_inter); 1641 if (ret < 0) { 1642 return ret; 1643 } else if (ret) { 1644 *pnum = pnum_inter; 1645 return 1; 1646 } 1647 1648 /* 1649 * [sector_num, nb_sectors] is unallocated on top but intermediate 1650 * might have 1651 * 1652 * [sector_num+x, nr_sectors] allocated. 
1653 */ 1654 if (n > pnum_inter && 1655 (intermediate == top || 1656 sector_num + pnum_inter < intermediate->total_sectors)) { 1657 n = pnum_inter; 1658 } 1659 1660 intermediate = intermediate->backing_hd; 1661 } 1662 1663 *pnum = n; 1664 return 0; 1665 } 1666 1667 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1668 const uint8_t *buf, int nb_sectors) 1669 { 1670 BlockDriver *drv = bs->drv; 1671 int ret; 1672 1673 if (!drv) { 1674 return -ENOMEDIUM; 1675 } 1676 if (!drv->bdrv_write_compressed) { 1677 return -ENOTSUP; 1678 } 1679 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1680 if (ret < 0) { 1681 return ret; 1682 } 1683 1684 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1685 1686 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1687 } 1688 1689 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1690 int64_t pos, int size) 1691 { 1692 QEMUIOVector qiov; 1693 struct iovec iov = { 1694 .iov_base = (void *) buf, 1695 .iov_len = size, 1696 }; 1697 1698 qemu_iovec_init_external(&qiov, &iov, 1); 1699 return bdrv_writev_vmstate(bs, &qiov, pos); 1700 } 1701 1702 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1703 { 1704 BlockDriver *drv = bs->drv; 1705 1706 if (!drv) { 1707 return -ENOMEDIUM; 1708 } else if (drv->bdrv_save_vmstate) { 1709 return drv->bdrv_save_vmstate(bs, qiov, pos); 1710 } else if (bs->file) { 1711 return bdrv_writev_vmstate(bs->file, qiov, pos); 1712 } 1713 1714 return -ENOTSUP; 1715 } 1716 1717 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1718 int64_t pos, int size) 1719 { 1720 BlockDriver *drv = bs->drv; 1721 if (!drv) 1722 return -ENOMEDIUM; 1723 if (drv->bdrv_load_vmstate) 1724 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1725 if (bs->file) 1726 return bdrv_load_vmstate(bs->file, buf, pos, size); 1727 return -ENOTSUP; 1728 } 1729 1730 /**************************************************************/ 1731 /* async I/Os */ 1732 1733 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1734 QEMUIOVector *qiov, int nb_sectors, 1735 BlockCompletionFunc *cb, void *opaque) 1736 { 1737 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1738 1739 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1740 cb, opaque, false); 1741 } 1742 1743 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1744 QEMUIOVector *qiov, int nb_sectors, 1745 BlockCompletionFunc *cb, void *opaque) 1746 { 1747 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1748 1749 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1750 cb, opaque, true); 1751 } 1752 1753 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1754 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1755 BlockCompletionFunc *cb, void *opaque) 1756 { 1757 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1758 1759 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1760 BDRV_REQ_ZERO_WRITE | flags, 1761 cb, opaque, true); 1762 } 1763 1764 1765 typedef struct MultiwriteCB { 1766 int error; 1767 int num_requests; 1768 int num_callbacks; 1769 struct { 1770 BlockCompletionFunc *cb; 1771 void *opaque; 1772 QEMUIOVector *free_qiov; 1773 } callbacks[]; 1774 } MultiwriteCB; 1775 1776 static void multiwrite_user_cb(MultiwriteCB *mcb) 1777 { 1778 int i; 1779 1780 for (i = 0; i < mcb->num_callbacks; i++) { 1781 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1782 if (mcb->callbacks[i].free_qiov) { 1783 
qemu_iovec_destroy(mcb->callbacks[i].free_qiov); 1784 } 1785 g_free(mcb->callbacks[i].free_qiov); 1786 } 1787 } 1788 1789 static void multiwrite_cb(void *opaque, int ret) 1790 { 1791 MultiwriteCB *mcb = opaque; 1792 1793 trace_multiwrite_cb(mcb, ret); 1794 1795 if (ret < 0 && !mcb->error) { 1796 mcb->error = ret; 1797 } 1798 1799 mcb->num_requests--; 1800 if (mcb->num_requests == 0) { 1801 multiwrite_user_cb(mcb); 1802 g_free(mcb); 1803 } 1804 } 1805 1806 static int multiwrite_req_compare(const void *a, const void *b) 1807 { 1808 const BlockRequest *req1 = a, *req2 = b; 1809 1810 /* 1811 * Note that we can't simply subtract req2->sector from req1->sector 1812 * here as that could overflow the return value. 1813 */ 1814 if (req1->sector > req2->sector) { 1815 return 1; 1816 } else if (req1->sector < req2->sector) { 1817 return -1; 1818 } else { 1819 return 0; 1820 } 1821 } 1822 1823 /* 1824 * Takes a bunch of requests and tries to merge them. Returns the number of 1825 * requests that remain after merging. 1826 */ 1827 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, 1828 int num_reqs, MultiwriteCB *mcb) 1829 { 1830 int i, outidx; 1831 1832 // Sort requests by start sector 1833 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); 1834 1835 // Check if adjacent requests touch the same clusters. If so, combine them, 1836 // filling up gaps with zero sectors. 1837 outidx = 0; 1838 for (i = 1; i < num_reqs; i++) { 1839 int merge = 0; 1840 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; 1841 1842 // Handle exactly sequential writes and overlapping writes. 1843 if (reqs[i].sector <= oldreq_last) { 1844 merge = 1; 1845 } 1846 1847 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { 1848 merge = 0; 1849 } 1850 1851 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + 1852 reqs[i].nb_sectors > bs->bl.max_transfer_length) { 1853 merge = 0; 1854 } 1855 1856 if (merge) { 1857 size_t size; 1858 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); 1859 qemu_iovec_init(qiov, 1860 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); 1861 1862 // Add the first request to the merged one. If the requests are 1863 // overlapping, drop the last sectors of the first request. 1864 size = (reqs[i].sector - reqs[outidx].sector) << 9; 1865 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); 1866 1867 // We should need to add any zeros between the two requests 1868 assert (reqs[i].sector <= oldreq_last); 1869 1870 // Add the second request 1871 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); 1872 1873 // Add tail of first request, if necessary 1874 if (qiov->size < reqs[outidx].qiov->size) { 1875 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, 1876 reqs[outidx].qiov->size - qiov->size); 1877 } 1878 1879 reqs[outidx].nb_sectors = qiov->size >> 9; 1880 reqs[outidx].qiov = qiov; 1881 1882 mcb->callbacks[i].free_qiov = reqs[outidx].qiov; 1883 } else { 1884 outidx++; 1885 reqs[outidx].sector = reqs[i].sector; 1886 reqs[outidx].nb_sectors = reqs[i].nb_sectors; 1887 reqs[outidx].qiov = reqs[i].qiov; 1888 } 1889 } 1890 1891 block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); 1892 1893 return outidx + 1; 1894 } 1895 1896 /* 1897 * Submit multiple AIO write requests at once. 1898 * 1899 * On success, the function returns 0 and all requests in the reqs array have 1900 * been submitted. In error case this function returns -1, and any of the 1901 * requests may or may not be submitted yet. 
In particular, this means that the 1902 * callback will be called for some of the requests, for others it won't. The 1903 * caller must check the error field of the BlockRequest to wait for the right 1904 * callbacks (if error != 0, no callback will be called). 1905 * 1906 * The implementation may modify the contents of the reqs array, e.g. to merge 1907 * requests. However, the fields opaque and error are left unmodified as they 1908 * are used to signal failure for a single request to the caller. 1909 */ 1910 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 1911 { 1912 MultiwriteCB *mcb; 1913 int i; 1914 1915 /* don't submit writes if we don't have a medium */ 1916 if (bs->drv == NULL) { 1917 for (i = 0; i < num_reqs; i++) { 1918 reqs[i].error = -ENOMEDIUM; 1919 } 1920 return -1; 1921 } 1922 1923 if (num_reqs == 0) { 1924 return 0; 1925 } 1926 1927 // Create MultiwriteCB structure 1928 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 1929 mcb->num_requests = 0; 1930 mcb->num_callbacks = num_reqs; 1931 1932 for (i = 0; i < num_reqs; i++) { 1933 mcb->callbacks[i].cb = reqs[i].cb; 1934 mcb->callbacks[i].opaque = reqs[i].opaque; 1935 } 1936 1937 // Check for mergable requests 1938 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 1939 1940 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 1941 1942 /* Run the aio requests. */ 1943 mcb->num_requests = num_reqs; 1944 for (i = 0; i < num_reqs; i++) { 1945 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 1946 reqs[i].nb_sectors, reqs[i].flags, 1947 multiwrite_cb, mcb, 1948 true); 1949 } 1950 1951 return 0; 1952 } 1953 1954 void bdrv_aio_cancel(BlockAIOCB *acb) 1955 { 1956 qemu_aio_ref(acb); 1957 bdrv_aio_cancel_async(acb); 1958 while (acb->refcnt > 1) { 1959 if (acb->aiocb_info->get_aio_context) { 1960 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 1961 } else if (acb->bs) { 1962 aio_poll(bdrv_get_aio_context(acb->bs), true); 1963 } else { 1964 abort(); 1965 } 1966 } 1967 qemu_aio_unref(acb); 1968 } 1969 1970 /* Async version of aio cancel. The caller is not blocked if the acb implements 1971 * cancel_async, otherwise we do nothing and let the request normally complete. 1972 * In either case the completion callback must be called. 

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
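
/*
 * Illustrative sketch (not part of the original code): a hypothetical format
 * driver that only provides synchronous sector I/O.  The emulation wrappers
 * above then give it an asynchronous, vectored interface through the
 * bounce-buffer path.  "toy" and its callbacks are invented names.
 *
 *     static int toy_read(BlockDriverState *bs, int64_t sector_num,
 *                         uint8_t *buf, int nb_sectors)
 *     {
 *         ... fill buf from the image, return 0 or -errno ...
 *     }
 *
 *     static int toy_write(BlockDriverState *bs, int64_t sector_num,
 *                          const uint8_t *buf, int nb_sectors)
 *     {
 *         ... write buf to the image, return 0 or -errno ...
 *     }
 *
 *     static BlockDriver bdrv_toy = {
 *         .format_name = "toy",
 *         .bdrv_read   = toy_read,
 *         .bdrv_write  = toy_write,
 *     };
 */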

typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_flush(bs, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}
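
/*
 * Illustrative sketch (not part of the original code): how a driver-specific
 * AIOCB type builds on qemu_aio_get()/qemu_aio_unref().  MyAIOCB and
 * my_aiocb_info are invented names.
 *
 *     typedef struct MyAIOCB {
 *         BlockAIOCB common;      // must be the first member
 *         int ret;                // driver-specific state follows
 *     } MyAIOCB;
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 *     // ... start the operation; the acb is created with refcnt == 1 ...
 *
 *     // On completion, report the result and drop the reference:
 *     acb->common.cb(acb->common.opaque, acb->ret);
 *     qemu_aio_unref(acb);
 *
 * Anything that needs the acb to stay valid across a blocking wait (such as
 * bdrv_aio_cancel() above) takes an extra reference with qemu_aio_ref().
 */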

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
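
/*
 * Illustrative sketch (not part of the original code): one way a driver can
 * plug into the flush cascade above is by implementing bdrv_co_flush_to_disk.
 * ToyState and toy_co_flush_to_disk are invented names; a file-backed driver
 * might simply flush its descriptor:
 *
 *     typedef struct ToyState {
 *         int fd;
 *     } ToyState;
 *
 *     static coroutine_fn int toy_co_flush_to_disk(BlockDriverState *bs)
 *     {
 *         ToyState *s = bs->opaque;
 *
 *         return fdatasync(s->fd) == 0 ? 0 : -errno;
 *     }
 *
 * A real driver would usually offload the blocking syscall to a worker
 * thread rather than stall the coroutine.  Drivers that only cache data in
 * the host page cache would instead implement bdrv_co_flush_to_os.
 */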

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
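
/*
 * Worked example for the alignment/clamping logic above (illustrative
 * numbers): with bs->bl.discard_alignment == 8, a call covering sectors
 * 5..104 (sector_num == 5, nb_sectors == 100) proceeds as:
 *
 *     iteration 1: num = 100 -> clamped to 8 -> minus (5 % 8) -> num = 3,
 *                  so sectors 5..7 are discarded and sector_num becomes 8,
 *                  which is aligned;
 *     iteration 2: num = 97, no alignment adjustment needed, limited only
 *                  by max_discard, so (with a large max_discard) sectors
 *                  8..104 are discarded in one request.
 *
 * The first, short request exists only to bring subsequent requests onto a
 * discard_alignment boundary.
 */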

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}
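
/*
 * Illustrative sketch (not part of the original code): request batching with
 * the plug/unplug hooks above.  A caller that is about to issue several
 * requests back to back can bracket them so that a capable driver submits
 * them as one batch; my_cb and reqs are placeholders.
 *
 *     bdrv_io_plug(bs);
 *     for (i = 0; i < n; i++) {
 *         bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
 *                         reqs[i].nb_sectors, my_cb, &reqs[i]);
 *     }
 *     bdrv_io_unplug(bs);
 *
 * Drivers without plug/unplug support simply forward the calls to bs->file
 * or ignore them, so the bracketing is always safe.
 */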