/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_group_config(bs, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled IOs */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;
    bdrv_start_throttled_reqs(bs);
    throttle_group_unregister_bs(bs);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is already part of the same group as the one we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing_hd->bl.min_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
203 */ 204 void bdrv_enable_copy_on_read(BlockDriverState *bs) 205 { 206 bs->copy_on_read++; 207 } 208 209 void bdrv_disable_copy_on_read(BlockDriverState *bs) 210 { 211 assert(bs->copy_on_read > 0); 212 bs->copy_on_read--; 213 } 214 215 /* Check if any requests are in-flight (including throttled requests) */ 216 static bool bdrv_requests_pending(BlockDriverState *bs) 217 { 218 if (!QLIST_EMPTY(&bs->tracked_requests)) { 219 return true; 220 } 221 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 222 return true; 223 } 224 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 225 return true; 226 } 227 if (bs->file && bdrv_requests_pending(bs->file)) { 228 return true; 229 } 230 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { 231 return true; 232 } 233 return false; 234 } 235 236 /* 237 * Wait for pending requests to complete on a single BlockDriverState subtree 238 * 239 * See the warning in bdrv_drain_all(). This function can only be called if 240 * you are sure nothing can generate I/O because you have op blockers 241 * installed. 242 * 243 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 244 * AioContext. 245 */ 246 void bdrv_drain(BlockDriverState *bs) 247 { 248 bool busy = true; 249 250 while (busy) { 251 /* Keep iterating */ 252 bdrv_flush_io_queue(bs); 253 busy = bdrv_requests_pending(bs); 254 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 255 } 256 } 257 258 /* 259 * Wait for pending requests to complete across all BlockDriverStates 260 * 261 * This function does not flush data to disk, use bdrv_flush_all() for that 262 * after calling this function. 263 * 264 * Note that completion of an asynchronous I/O operation can trigger any 265 * number of other I/O operations on other devices---for example a coroutine 266 * can be arbitrarily complex and a constant flow of I/O can come until the 267 * coroutine is complete. Because of this, it is not possible to have a 268 * function to drain a single device's I/O queue. 269 */ 270 void bdrv_drain_all(void) 271 { 272 /* Always run first iteration so any pending completion BHs run */ 273 bool busy = true; 274 BlockDriverState *bs = NULL; 275 GSList *aio_ctxs = NULL, *ctx; 276 277 while ((bs = bdrv_next(bs))) { 278 AioContext *aio_context = bdrv_get_aio_context(bs); 279 280 aio_context_acquire(aio_context); 281 if (bs->job) { 282 block_job_pause(bs->job); 283 } 284 aio_context_release(aio_context); 285 286 if (!g_slist_find(aio_ctxs, aio_context)) { 287 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 288 } 289 } 290 291 while (busy) { 292 busy = false; 293 294 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 295 AioContext *aio_context = ctx->data; 296 bs = NULL; 297 298 aio_context_acquire(aio_context); 299 while ((bs = bdrv_next(bs))) { 300 if (aio_context == bdrv_get_aio_context(bs)) { 301 bdrv_flush_io_queue(bs); 302 if (bdrv_requests_pending(bs)) { 303 busy = true; 304 aio_poll(aio_context, busy); 305 } 306 } 307 } 308 busy |= aio_poll(aio_context, false); 309 aio_context_release(aio_context); 310 } 311 } 312 313 bs = NULL; 314 while ((bs = bdrv_next(bs))) { 315 AioContext *aio_context = bdrv_get_aio_context(bs); 316 317 aio_context_acquire(aio_context); 318 if (bs->job) { 319 block_job_resume(bs->job); 320 } 321 aio_context_release(aio_context); 322 } 323 g_slist_free(aio_ctxs); 324 } 325 326 /** 327 * Remove an active request from the tracked requests list 328 * 329 * This function should be called when a tracked request is completing. 
330 */ 331 static void tracked_request_end(BdrvTrackedRequest *req) 332 { 333 if (req->serialising) { 334 req->bs->serialising_in_flight--; 335 } 336 337 QLIST_REMOVE(req, list); 338 qemu_co_queue_restart_all(&req->wait_queue); 339 } 340 341 /** 342 * Add an active request to the tracked requests list 343 */ 344 static void tracked_request_begin(BdrvTrackedRequest *req, 345 BlockDriverState *bs, 346 int64_t offset, 347 unsigned int bytes, bool is_write) 348 { 349 *req = (BdrvTrackedRequest){ 350 .bs = bs, 351 .offset = offset, 352 .bytes = bytes, 353 .is_write = is_write, 354 .co = qemu_coroutine_self(), 355 .serialising = false, 356 .overlap_offset = offset, 357 .overlap_bytes = bytes, 358 }; 359 360 qemu_co_queue_init(&req->wait_queue); 361 362 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 363 } 364 365 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 366 { 367 int64_t overlap_offset = req->offset & ~(align - 1); 368 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 369 - overlap_offset; 370 371 if (!req->serialising) { 372 req->bs->serialising_in_flight++; 373 req->serialising = true; 374 } 375 376 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 377 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 378 } 379 380 /** 381 * Round a region to cluster boundaries 382 */ 383 void bdrv_round_to_clusters(BlockDriverState *bs, 384 int64_t sector_num, int nb_sectors, 385 int64_t *cluster_sector_num, 386 int *cluster_nb_sectors) 387 { 388 BlockDriverInfo bdi; 389 390 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 391 *cluster_sector_num = sector_num; 392 *cluster_nb_sectors = nb_sectors; 393 } else { 394 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 395 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 396 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 397 nb_sectors, c); 398 } 399 } 400 401 static int bdrv_get_cluster_size(BlockDriverState *bs) 402 { 403 BlockDriverInfo bdi; 404 int ret; 405 406 ret = bdrv_get_info(bs, &bdi); 407 if (ret < 0 || bdi.cluster_size == 0) { 408 return bs->request_alignment; 409 } else { 410 return bdi.cluster_size; 411 } 412 } 413 414 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 415 int64_t offset, unsigned int bytes) 416 { 417 /* aaaa bbbb */ 418 if (offset >= req->overlap_offset + req->overlap_bytes) { 419 return false; 420 } 421 /* bbbb aaaa */ 422 if (req->overlap_offset >= offset + bytes) { 423 return false; 424 } 425 return true; 426 } 427 428 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 429 { 430 BlockDriverState *bs = self->bs; 431 BdrvTrackedRequest *req; 432 bool retry; 433 bool waited = false; 434 435 if (!bs->serialising_in_flight) { 436 return false; 437 } 438 439 do { 440 retry = false; 441 QLIST_FOREACH(req, &bs->tracked_requests, list) { 442 if (req == self || (!req->serialising && !self->serialising)) { 443 continue; 444 } 445 if (tracked_request_overlaps(req, self->overlap_offset, 446 self->overlap_bytes)) 447 { 448 /* Hitting this means there was a reentrant request, for 449 * example, a block driver issuing nested requests. This must 450 * never happen since it means deadlock. 451 */ 452 assert(qemu_coroutine_self() != req->co); 453 454 /* If the request is already (indirectly) waiting for us, or 455 * will wait for us as soon as it wakes up, then just go on 456 * (instead of producing a deadlock in the former case). 
*/ 457 if (!req->waiting_for) { 458 self->waiting_for = req; 459 qemu_co_queue_wait(&req->wait_queue); 460 self->waiting_for = NULL; 461 retry = true; 462 waited = true; 463 break; 464 } 465 } 466 } 467 } while (retry); 468 469 return waited; 470 } 471 472 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 473 size_t size) 474 { 475 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 476 return -EIO; 477 } 478 479 if (!bdrv_is_inserted(bs)) { 480 return -ENOMEDIUM; 481 } 482 483 if (offset < 0) { 484 return -EIO; 485 } 486 487 return 0; 488 } 489 490 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 491 int nb_sectors) 492 { 493 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 494 return -EIO; 495 } 496 497 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 498 nb_sectors * BDRV_SECTOR_SIZE); 499 } 500 501 typedef struct RwCo { 502 BlockDriverState *bs; 503 int64_t offset; 504 QEMUIOVector *qiov; 505 bool is_write; 506 int ret; 507 BdrvRequestFlags flags; 508 } RwCo; 509 510 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 511 { 512 RwCo *rwco = opaque; 513 514 if (!rwco->is_write) { 515 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 516 rwco->qiov->size, rwco->qiov, 517 rwco->flags); 518 } else { 519 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 520 rwco->qiov->size, rwco->qiov, 521 rwco->flags); 522 } 523 } 524 525 /* 526 * Process a vectored synchronous request using coroutines 527 */ 528 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 529 QEMUIOVector *qiov, bool is_write, 530 BdrvRequestFlags flags) 531 { 532 Coroutine *co; 533 RwCo rwco = { 534 .bs = bs, 535 .offset = offset, 536 .qiov = qiov, 537 .is_write = is_write, 538 .ret = NOT_DONE, 539 .flags = flags, 540 }; 541 542 /** 543 * In sync call context, when the vcpu is blocked, this throttling timer 544 * will not fire; so the I/O throttling function has to be disabled here 545 * if it has been enabled. 546 */ 547 if (bs->io_limits_enabled) { 548 fprintf(stderr, "Disabling I/O throttling on '%s' due " 549 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 550 bdrv_io_limits_disable(bs); 551 } 552 553 if (qemu_in_coroutine()) { 554 /* Fast-path if already in coroutine context */ 555 bdrv_rw_co_entry(&rwco); 556 } else { 557 AioContext *aio_context = bdrv_get_aio_context(bs); 558 559 co = qemu_coroutine_create(bdrv_rw_co_entry); 560 qemu_coroutine_enter(co, &rwco); 561 while (rwco.ret == NOT_DONE) { 562 aio_poll(aio_context, true); 563 } 564 } 565 return rwco.ret; 566 } 567 568 /* 569 * Process a synchronous request using coroutines 570 */ 571 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 572 int nb_sectors, bool is_write, BdrvRequestFlags flags) 573 { 574 QEMUIOVector qiov; 575 struct iovec iov = { 576 .iov_base = (void *)buf, 577 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 578 }; 579 580 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 581 return -EINVAL; 582 } 583 584 qemu_iovec_init_external(&qiov, &iov, 1); 585 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 586 &qiov, is_write, flags); 587 } 588 589 /* return < 0 if error. 
See bdrv_write() for the return codes */ 590 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 591 uint8_t *buf, int nb_sectors) 592 { 593 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 594 } 595 596 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 597 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 598 uint8_t *buf, int nb_sectors) 599 { 600 bool enabled; 601 int ret; 602 603 enabled = bs->io_limits_enabled; 604 bs->io_limits_enabled = false; 605 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 606 bs->io_limits_enabled = enabled; 607 return ret; 608 } 609 610 /* Return < 0 if error. Important errors are: 611 -EIO generic I/O error (may happen for all errors) 612 -ENOMEDIUM No media inserted. 613 -EINVAL Invalid sector number or nb_sectors 614 -EACCES Trying to write a read-only device 615 */ 616 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 617 const uint8_t *buf, int nb_sectors) 618 { 619 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 620 } 621 622 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 623 int nb_sectors, BdrvRequestFlags flags) 624 { 625 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 626 BDRV_REQ_ZERO_WRITE | flags); 627 } 628 629 /* 630 * Completely zero out a block device with the help of bdrv_write_zeroes. 631 * The operation is sped up by checking the block status and only writing 632 * zeroes to the device if they currently do not return zeroes. Optional 633 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 634 * 635 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 636 */ 637 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 638 { 639 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 640 int n; 641 642 target_sectors = bdrv_nb_sectors(bs); 643 if (target_sectors < 0) { 644 return target_sectors; 645 } 646 647 for (;;) { 648 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 649 if (nb_sectors <= 0) { 650 return 0; 651 } 652 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 653 if (ret < 0) { 654 error_report("error getting block status at sector %" PRId64 ": %s", 655 sector_num, strerror(-ret)); 656 return ret; 657 } 658 if (ret & BDRV_BLOCK_ZERO) { 659 sector_num += n; 660 continue; 661 } 662 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 663 if (ret < 0) { 664 error_report("error writing zeroes at sector %" PRId64 ": %s", 665 sector_num, strerror(-ret)); 666 return ret; 667 } 668 sector_num += n; 669 } 670 } 671 672 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 673 { 674 QEMUIOVector qiov; 675 struct iovec iov = { 676 .iov_base = (void *)buf, 677 .iov_len = bytes, 678 }; 679 int ret; 680 681 if (bytes < 0) { 682 return -EINVAL; 683 } 684 685 qemu_iovec_init_external(&qiov, &iov, 1); 686 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 687 if (ret < 0) { 688 return ret; 689 } 690 691 return bytes; 692 } 693 694 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 695 { 696 int ret; 697 698 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 699 if (ret < 0) { 700 return ret; 701 } 702 703 return qiov->size; 704 } 705 706 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 707 const void *buf, int bytes) 708 { 709 QEMUIOVector qiov; 710 struct iovec iov = { 711 .iov_base = (void *) buf, 712 .iov_len = bytes, 713 }; 714 715 if (bytes < 0) { 716 return -EINVAL; 717 } 718 719 
qemu_iovec_init_external(&qiov, &iov, 1); 720 return bdrv_pwritev(bs, offset, &qiov); 721 } 722 723 /* 724 * Writes to the file and ensures that no writes are reordered across this 725 * request (acts as a barrier) 726 * 727 * Returns 0 on success, -errno in error cases. 728 */ 729 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 730 const void *buf, int count) 731 { 732 int ret; 733 734 ret = bdrv_pwrite(bs, offset, buf, count); 735 if (ret < 0) { 736 return ret; 737 } 738 739 /* No flush needed for cache modes that already do it */ 740 if (bs->enable_write_cache) { 741 bdrv_flush(bs); 742 } 743 744 return 0; 745 } 746 747 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 748 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 749 { 750 /* Perform I/O through a temporary buffer so that users who scribble over 751 * their read buffer while the operation is in progress do not end up 752 * modifying the image file. This is critical for zero-copy guest I/O 753 * where anything might happen inside guest memory. 754 */ 755 void *bounce_buffer; 756 757 BlockDriver *drv = bs->drv; 758 struct iovec iov; 759 QEMUIOVector bounce_qiov; 760 int64_t cluster_sector_num; 761 int cluster_nb_sectors; 762 size_t skip_bytes; 763 int ret; 764 765 /* Cover entire cluster so no additional backing file I/O is required when 766 * allocating cluster in the image file. 767 */ 768 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 769 &cluster_sector_num, &cluster_nb_sectors); 770 771 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 772 cluster_sector_num, cluster_nb_sectors); 773 774 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 775 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 776 if (bounce_buffer == NULL) { 777 ret = -ENOMEM; 778 goto err; 779 } 780 781 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 782 783 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 784 &bounce_qiov); 785 if (ret < 0) { 786 goto err; 787 } 788 789 if (drv->bdrv_co_write_zeroes && 790 buffer_is_zero(bounce_buffer, iov.iov_len)) { 791 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 792 cluster_nb_sectors, 0); 793 } else { 794 /* This does not change the data on the disk, it is not necessary 795 * to flush even in cache=writethrough mode. 796 */ 797 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 798 &bounce_qiov); 799 } 800 801 if (ret < 0) { 802 /* It might be okay to ignore write errors for guest requests. If this 803 * is a deliberate copy-on-read then we don't want to ignore the error. 804 * Simply report it in all cases. 805 */ 806 goto err; 807 } 808 809 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 810 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 811 nb_sectors * BDRV_SECTOR_SIZE); 812 813 err: 814 qemu_vfree(bounce_buffer); 815 return ret; 816 } 817 818 /* 819 * Forwards an already correctly aligned request to the BlockDriver. This 820 * handles copy on read and zeroing after EOF; any other features must be 821 * implemented by the caller. 
822 */ 823 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 824 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 825 int64_t align, QEMUIOVector *qiov, int flags) 826 { 827 BlockDriver *drv = bs->drv; 828 int ret; 829 830 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 831 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 832 833 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 834 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 835 assert(!qiov || bytes == qiov->size); 836 837 /* Handle Copy on Read and associated serialisation */ 838 if (flags & BDRV_REQ_COPY_ON_READ) { 839 /* If we touch the same cluster it counts as an overlap. This 840 * guarantees that allocating writes will be serialized and not race 841 * with each other for the same cluster. For example, in copy-on-read 842 * it ensures that the CoR read and write operations are atomic and 843 * guest writes cannot interleave between them. */ 844 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 845 } 846 847 wait_serialising_requests(req); 848 849 if (flags & BDRV_REQ_COPY_ON_READ) { 850 int pnum; 851 852 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 853 if (ret < 0) { 854 goto out; 855 } 856 857 if (!ret || pnum != nb_sectors) { 858 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 859 goto out; 860 } 861 } 862 863 /* Forward the request to the BlockDriver */ 864 if (!bs->zero_beyond_eof) { 865 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 866 } else { 867 /* Read zeros after EOF */ 868 int64_t total_sectors, max_nb_sectors; 869 870 total_sectors = bdrv_nb_sectors(bs); 871 if (total_sectors < 0) { 872 ret = total_sectors; 873 goto out; 874 } 875 876 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 877 align >> BDRV_SECTOR_BITS); 878 if (nb_sectors < max_nb_sectors) { 879 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 880 } else if (max_nb_sectors > 0) { 881 QEMUIOVector local_qiov; 882 883 qemu_iovec_init(&local_qiov, qiov->niov); 884 qemu_iovec_concat(&local_qiov, qiov, 0, 885 max_nb_sectors * BDRV_SECTOR_SIZE); 886 887 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 888 &local_qiov); 889 890 qemu_iovec_destroy(&local_qiov); 891 } else { 892 ret = 0; 893 } 894 895 /* Reading beyond end of file is supposed to produce zeroes */ 896 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 897 uint64_t offset = MAX(0, total_sectors - sector_num); 898 uint64_t bytes = (sector_num + nb_sectors - offset) * 899 BDRV_SECTOR_SIZE; 900 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 901 } 902 } 903 904 out: 905 return ret; 906 } 907 908 /* 909 * Handle a read request in coroutine context 910 */ 911 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 912 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 913 BdrvRequestFlags flags) 914 { 915 BlockDriver *drv = bs->drv; 916 BdrvTrackedRequest req; 917 918 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 919 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 920 uint8_t *head_buf = NULL; 921 uint8_t *tail_buf = NULL; 922 QEMUIOVector local_qiov; 923 bool use_local_qiov = false; 924 int ret; 925 926 if (!drv) { 927 return -ENOMEDIUM; 928 } 929 930 ret = bdrv_check_byte_request(bs, offset, bytes); 931 if (ret < 0) { 932 return ret; 933 } 934 935 if (bs->copy_on_read) { 936 flags |= BDRV_REQ_COPY_ON_READ; 937 } 938 939 /* throttling disk I/O */ 940 if (bs->io_limits_enabled) { 941 
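        /* Note: if the configured limits are currently exceeded, this call may
         * yield and park the coroutine in bs->throttled_reqs until the throttle
         * group's timer (or a completing request) schedules it again. */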
throttle_group_co_io_limits_intercept(bs, bytes, false); 942 } 943 944 /* Align read if necessary by padding qiov */ 945 if (offset & (align - 1)) { 946 head_buf = qemu_blockalign(bs, align); 947 qemu_iovec_init(&local_qiov, qiov->niov + 2); 948 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 949 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 950 use_local_qiov = true; 951 952 bytes += offset & (align - 1); 953 offset = offset & ~(align - 1); 954 } 955 956 if ((offset + bytes) & (align - 1)) { 957 if (!use_local_qiov) { 958 qemu_iovec_init(&local_qiov, qiov->niov + 1); 959 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 960 use_local_qiov = true; 961 } 962 tail_buf = qemu_blockalign(bs, align); 963 qemu_iovec_add(&local_qiov, tail_buf, 964 align - ((offset + bytes) & (align - 1))); 965 966 bytes = ROUND_UP(bytes, align); 967 } 968 969 tracked_request_begin(&req, bs, offset, bytes, false); 970 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 971 use_local_qiov ? &local_qiov : qiov, 972 flags); 973 tracked_request_end(&req); 974 975 if (use_local_qiov) { 976 qemu_iovec_destroy(&local_qiov); 977 qemu_vfree(head_buf); 978 qemu_vfree(tail_buf); 979 } 980 981 return ret; 982 } 983 984 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 985 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 986 BdrvRequestFlags flags) 987 { 988 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 989 return -EINVAL; 990 } 991 992 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 993 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 994 } 995 996 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 997 int nb_sectors, QEMUIOVector *qiov) 998 { 999 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1000 1001 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1002 } 1003 1004 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1005 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1006 { 1007 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1008 1009 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1010 BDRV_REQ_COPY_ON_READ); 1011 } 1012 1013 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1014 1015 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1016 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1017 { 1018 BlockDriver *drv = bs->drv; 1019 QEMUIOVector qiov; 1020 struct iovec iov = {0}; 1021 int ret = 0; 1022 1023 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1024 BDRV_REQUEST_MAX_SECTORS); 1025 1026 while (nb_sectors > 0 && !ret) { 1027 int num = nb_sectors; 1028 1029 /* Align request. Block drivers can expect the "bulk" of the request 1030 * to be aligned. 1031 */ 1032 if (bs->bl.write_zeroes_alignment 1033 && num > bs->bl.write_zeroes_alignment) { 1034 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1035 /* Make a small request up to the first aligned sector. */ 1036 num = bs->bl.write_zeroes_alignment; 1037 num -= sector_num % bs->bl.write_zeroes_alignment; 1038 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1039 /* Shorten the request to the last aligned sector. num cannot 1040 * underflow because num > bs->bl.write_zeroes_alignment. 
             */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
BdrvTrackedRequest *req) 1158 { 1159 uint8_t *buf = NULL; 1160 QEMUIOVector local_qiov; 1161 struct iovec iov; 1162 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1163 unsigned int head_padding_bytes, tail_padding_bytes; 1164 int ret = 0; 1165 1166 head_padding_bytes = offset & (align - 1); 1167 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1168 1169 1170 assert(flags & BDRV_REQ_ZERO_WRITE); 1171 if (head_padding_bytes || tail_padding_bytes) { 1172 buf = qemu_blockalign(bs, align); 1173 iov = (struct iovec) { 1174 .iov_base = buf, 1175 .iov_len = align, 1176 }; 1177 qemu_iovec_init_external(&local_qiov, &iov, 1); 1178 } 1179 if (head_padding_bytes) { 1180 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1181 1182 /* RMW the unaligned part before head. */ 1183 mark_request_serialising(req, align); 1184 wait_serialising_requests(req); 1185 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 1186 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1187 align, &local_qiov, 0); 1188 if (ret < 0) { 1189 goto fail; 1190 } 1191 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1192 1193 memset(buf + head_padding_bytes, 0, zero_bytes); 1194 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1195 &local_qiov, 1196 flags & ~BDRV_REQ_ZERO_WRITE); 1197 if (ret < 0) { 1198 goto fail; 1199 } 1200 offset += zero_bytes; 1201 bytes -= zero_bytes; 1202 } 1203 1204 assert(!bytes || (offset & (align - 1)) == 0); 1205 if (bytes >= align) { 1206 /* Write the aligned part in the middle. */ 1207 uint64_t aligned_bytes = bytes & ~(align - 1); 1208 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1209 NULL, flags); 1210 if (ret < 0) { 1211 goto fail; 1212 } 1213 bytes -= aligned_bytes; 1214 offset += aligned_bytes; 1215 } 1216 1217 assert(!bytes || (offset & (align - 1)) == 0); 1218 if (bytes) { 1219 assert(align == tail_padding_bytes + bytes); 1220 /* RMW the unaligned part after tail. */ 1221 mark_request_serialising(req, align); 1222 wait_serialising_requests(req); 1223 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 1224 ret = bdrv_aligned_preadv(bs, req, offset, align, 1225 align, &local_qiov, 0); 1226 if (ret < 0) { 1227 goto fail; 1228 } 1229 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1230 1231 memset(buf, 0, bytes); 1232 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1233 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1234 } 1235 fail: 1236 qemu_vfree(buf); 1237 return ret; 1238 1239 } 1240 1241 /* 1242 * Handle a write request in coroutine context 1243 */ 1244 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1245 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1246 BdrvRequestFlags flags) 1247 { 1248 BdrvTrackedRequest req; 1249 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1250 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1251 uint8_t *head_buf = NULL; 1252 uint8_t *tail_buf = NULL; 1253 QEMUIOVector local_qiov; 1254 bool use_local_qiov = false; 1255 int ret; 1256 1257 if (!bs->drv) { 1258 return -ENOMEDIUM; 1259 } 1260 if (bs->read_only) { 1261 return -EPERM; 1262 } 1263 1264 ret = bdrv_check_byte_request(bs, offset, bytes); 1265 if (ret < 0) { 1266 return ret; 1267 } 1268 1269 /* throttling disk I/O */ 1270 if (bs->io_limits_enabled) { 1271 throttle_group_co_io_limits_intercept(bs, bytes, true); 1272 } 1273 1274 /* 1275 * Align write if necessary by performing a read-modify-write cycle. 
1276 * Pad qiov with the read parts and be sure to have a tracked request not 1277 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1278 */ 1279 tracked_request_begin(&req, bs, offset, bytes, true); 1280 1281 if (!qiov) { 1282 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1283 goto out; 1284 } 1285 1286 if (offset & (align - 1)) { 1287 QEMUIOVector head_qiov; 1288 struct iovec head_iov; 1289 1290 mark_request_serialising(&req, align); 1291 wait_serialising_requests(&req); 1292 1293 head_buf = qemu_blockalign(bs, align); 1294 head_iov = (struct iovec) { 1295 .iov_base = head_buf, 1296 .iov_len = align, 1297 }; 1298 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1299 1300 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 1301 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1302 align, &head_qiov, 0); 1303 if (ret < 0) { 1304 goto fail; 1305 } 1306 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1307 1308 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1309 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1310 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1311 use_local_qiov = true; 1312 1313 bytes += offset & (align - 1); 1314 offset = offset & ~(align - 1); 1315 } 1316 1317 if ((offset + bytes) & (align - 1)) { 1318 QEMUIOVector tail_qiov; 1319 struct iovec tail_iov; 1320 size_t tail_bytes; 1321 bool waited; 1322 1323 mark_request_serialising(&req, align); 1324 waited = wait_serialising_requests(&req); 1325 assert(!waited || !use_local_qiov); 1326 1327 tail_buf = qemu_blockalign(bs, align); 1328 tail_iov = (struct iovec) { 1329 .iov_base = tail_buf, 1330 .iov_len = align, 1331 }; 1332 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1333 1334 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 1335 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1336 align, &tail_qiov, 0); 1337 if (ret < 0) { 1338 goto fail; 1339 } 1340 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1341 1342 if (!use_local_qiov) { 1343 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1344 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1345 use_local_qiov = true; 1346 } 1347 1348 tail_bytes = (offset + bytes) & (align - 1); 1349 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1350 1351 bytes = ROUND_UP(bytes, align); 1352 } 1353 1354 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1355 use_local_qiov ? 
&local_qiov : qiov, 1356 flags); 1357 1358 fail: 1359 1360 if (use_local_qiov) { 1361 qemu_iovec_destroy(&local_qiov); 1362 } 1363 qemu_vfree(head_buf); 1364 qemu_vfree(tail_buf); 1365 out: 1366 tracked_request_end(&req); 1367 return ret; 1368 } 1369 1370 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1371 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1372 BdrvRequestFlags flags) 1373 { 1374 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1375 return -EINVAL; 1376 } 1377 1378 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1379 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1380 } 1381 1382 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1383 int nb_sectors, QEMUIOVector *qiov) 1384 { 1385 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1386 1387 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1388 } 1389 1390 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1391 int64_t sector_num, int nb_sectors, 1392 BdrvRequestFlags flags) 1393 { 1394 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1395 1396 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1397 flags &= ~BDRV_REQ_MAY_UNMAP; 1398 } 1399 1400 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1401 BDRV_REQ_ZERO_WRITE | flags); 1402 } 1403 1404 int bdrv_flush_all(void) 1405 { 1406 BlockDriverState *bs = NULL; 1407 int result = 0; 1408 1409 while ((bs = bdrv_next(bs))) { 1410 AioContext *aio_context = bdrv_get_aio_context(bs); 1411 int ret; 1412 1413 aio_context_acquire(aio_context); 1414 ret = bdrv_flush(bs); 1415 if (ret < 0 && !result) { 1416 result = ret; 1417 } 1418 aio_context_release(aio_context); 1419 } 1420 1421 return result; 1422 } 1423 1424 typedef struct BdrvCoGetBlockStatusData { 1425 BlockDriverState *bs; 1426 BlockDriverState *base; 1427 int64_t sector_num; 1428 int nb_sectors; 1429 int *pnum; 1430 int64_t ret; 1431 bool done; 1432 } BdrvCoGetBlockStatusData; 1433 1434 /* 1435 * Returns the allocation status of the specified sectors. 1436 * Drivers not implementing the functionality are assumed to not support 1437 * backing files, hence all their sectors are reported as allocated. 1438 * 1439 * If 'sector_num' is beyond the end of the disk image the return value is 0 1440 * and 'pnum' is set to 0. 1441 * 1442 * 'pnum' is set to the number of sectors (including and immediately following 1443 * the specified sector) that are known to be in the same 1444 * allocated/unallocated state. 1445 * 1446 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1447 * beyond the end of the disk image it will be clamped. 
1448 */ 1449 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1450 int64_t sector_num, 1451 int nb_sectors, int *pnum) 1452 { 1453 int64_t total_sectors; 1454 int64_t n; 1455 int64_t ret, ret2; 1456 1457 total_sectors = bdrv_nb_sectors(bs); 1458 if (total_sectors < 0) { 1459 return total_sectors; 1460 } 1461 1462 if (sector_num >= total_sectors) { 1463 *pnum = 0; 1464 return 0; 1465 } 1466 1467 n = total_sectors - sector_num; 1468 if (n < nb_sectors) { 1469 nb_sectors = n; 1470 } 1471 1472 if (!bs->drv->bdrv_co_get_block_status) { 1473 *pnum = nb_sectors; 1474 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1475 if (bs->drv->protocol_name) { 1476 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1477 } 1478 return ret; 1479 } 1480 1481 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 1482 if (ret < 0) { 1483 *pnum = 0; 1484 return ret; 1485 } 1486 1487 if (ret & BDRV_BLOCK_RAW) { 1488 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1489 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 1490 *pnum, pnum); 1491 } 1492 1493 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1494 ret |= BDRV_BLOCK_ALLOCATED; 1495 } else { 1496 if (bdrv_unallocated_blocks_are_zero(bs)) { 1497 ret |= BDRV_BLOCK_ZERO; 1498 } else if (bs->backing_hd) { 1499 BlockDriverState *bs2 = bs->backing_hd; 1500 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1501 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1502 ret |= BDRV_BLOCK_ZERO; 1503 } 1504 } 1505 } 1506 1507 if (bs->file && 1508 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1509 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1510 int file_pnum; 1511 1512 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 1513 *pnum, &file_pnum); 1514 if (ret2 >= 0) { 1515 /* Ignore errors. This is just providing extra information, it 1516 * is useful but not necessary. 1517 */ 1518 if (!file_pnum) { 1519 /* !file_pnum indicates an offset at or beyond the EOF; it is 1520 * perfectly valid for the format block driver to point to such 1521 * offsets, so catch it and mark everything as zero */ 1522 ret |= BDRV_BLOCK_ZERO; 1523 } else { 1524 /* Limit request to the range reported by the protocol driver */ 1525 *pnum = file_pnum; 1526 ret |= (ret2 & BDRV_BLOCK_ZERO); 1527 } 1528 } 1529 } 1530 1531 return ret; 1532 } 1533 1534 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, 1535 BlockDriverState *base, 1536 int64_t sector_num, 1537 int nb_sectors, 1538 int *pnum) 1539 { 1540 BlockDriverState *p; 1541 int64_t ret = 0; 1542 1543 assert(bs != base); 1544 for (p = bs; p != base; p = p->backing_hd) { 1545 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); 1546 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { 1547 break; 1548 } 1549 /* [sector_num, pnum] unallocated on this layer, which could be only 1550 * the first part of [sector_num, nb_sectors]. */ 1551 nb_sectors = MIN(nb_sectors, *pnum); 1552 } 1553 return ret; 1554 } 1555 1556 /* Coroutine wrapper for bdrv_get_block_status_above() */ 1557 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) 1558 { 1559 BdrvCoGetBlockStatusData *data = opaque; 1560 1561 data->ret = bdrv_co_get_block_status_above(data->bs, data->base, 1562 data->sector_num, 1563 data->nb_sectors, 1564 data->pnum); 1565 data->done = true; 1566 } 1567 1568 /* 1569 * Synchronous wrapper around bdrv_co_get_block_status_above(). 1570 * 1571 * See bdrv_co_get_block_status_above() for details. 
1572 */ 1573 int64_t bdrv_get_block_status_above(BlockDriverState *bs, 1574 BlockDriverState *base, 1575 int64_t sector_num, 1576 int nb_sectors, int *pnum) 1577 { 1578 Coroutine *co; 1579 BdrvCoGetBlockStatusData data = { 1580 .bs = bs, 1581 .base = base, 1582 .sector_num = sector_num, 1583 .nb_sectors = nb_sectors, 1584 .pnum = pnum, 1585 .done = false, 1586 }; 1587 1588 if (qemu_in_coroutine()) { 1589 /* Fast-path if already in coroutine context */ 1590 bdrv_get_block_status_above_co_entry(&data); 1591 } else { 1592 AioContext *aio_context = bdrv_get_aio_context(bs); 1593 1594 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); 1595 qemu_coroutine_enter(co, &data); 1596 while (!data.done) { 1597 aio_poll(aio_context, true); 1598 } 1599 } 1600 return data.ret; 1601 } 1602 1603 int64_t bdrv_get_block_status(BlockDriverState *bs, 1604 int64_t sector_num, 1605 int nb_sectors, int *pnum) 1606 { 1607 return bdrv_get_block_status_above(bs, bs->backing_hd, 1608 sector_num, nb_sectors, pnum); 1609 } 1610 1611 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1612 int nb_sectors, int *pnum) 1613 { 1614 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 1615 if (ret < 0) { 1616 return ret; 1617 } 1618 return !!(ret & BDRV_BLOCK_ALLOCATED); 1619 } 1620 1621 /* 1622 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1623 * 1624 * Return true if the given sector is allocated in any image between 1625 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1626 * sector is allocated in any image of the chain. Return false otherwise. 1627 * 1628 * 'pnum' is set to the number of sectors (including and immediately following 1629 * the specified sector) that are known to be in the same 1630 * allocated/unallocated state. 1631 * 1632 */ 1633 int bdrv_is_allocated_above(BlockDriverState *top, 1634 BlockDriverState *base, 1635 int64_t sector_num, 1636 int nb_sectors, int *pnum) 1637 { 1638 BlockDriverState *intermediate; 1639 int ret, n = nb_sectors; 1640 1641 intermediate = top; 1642 while (intermediate && intermediate != base) { 1643 int pnum_inter; 1644 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1645 &pnum_inter); 1646 if (ret < 0) { 1647 return ret; 1648 } else if (ret) { 1649 *pnum = pnum_inter; 1650 return 1; 1651 } 1652 1653 /* 1654 * [sector_num, nb_sectors] is unallocated on top but intermediate 1655 * might have 1656 * 1657 * [sector_num+x, nr_sectors] allocated. 
1658 */ 1659 if (n > pnum_inter && 1660 (intermediate == top || 1661 sector_num + pnum_inter < intermediate->total_sectors)) { 1662 n = pnum_inter; 1663 } 1664 1665 intermediate = intermediate->backing_hd; 1666 } 1667 1668 *pnum = n; 1669 return 0; 1670 } 1671 1672 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1673 const uint8_t *buf, int nb_sectors) 1674 { 1675 BlockDriver *drv = bs->drv; 1676 int ret; 1677 1678 if (!drv) { 1679 return -ENOMEDIUM; 1680 } 1681 if (!drv->bdrv_write_compressed) { 1682 return -ENOTSUP; 1683 } 1684 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1685 if (ret < 0) { 1686 return ret; 1687 } 1688 1689 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1690 1691 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1692 } 1693 1694 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1695 int64_t pos, int size) 1696 { 1697 QEMUIOVector qiov; 1698 struct iovec iov = { 1699 .iov_base = (void *) buf, 1700 .iov_len = size, 1701 }; 1702 1703 qemu_iovec_init_external(&qiov, &iov, 1); 1704 return bdrv_writev_vmstate(bs, &qiov, pos); 1705 } 1706 1707 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1708 { 1709 BlockDriver *drv = bs->drv; 1710 1711 if (!drv) { 1712 return -ENOMEDIUM; 1713 } else if (drv->bdrv_save_vmstate) { 1714 return drv->bdrv_save_vmstate(bs, qiov, pos); 1715 } else if (bs->file) { 1716 return bdrv_writev_vmstate(bs->file, qiov, pos); 1717 } 1718 1719 return -ENOTSUP; 1720 } 1721 1722 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1723 int64_t pos, int size) 1724 { 1725 BlockDriver *drv = bs->drv; 1726 if (!drv) 1727 return -ENOMEDIUM; 1728 if (drv->bdrv_load_vmstate) 1729 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1730 if (bs->file) 1731 return bdrv_load_vmstate(bs->file, buf, pos, size); 1732 return -ENOTSUP; 1733 } 1734 1735 /**************************************************************/ 1736 /* async I/Os */ 1737 1738 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1739 QEMUIOVector *qiov, int nb_sectors, 1740 BlockCompletionFunc *cb, void *opaque) 1741 { 1742 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1743 1744 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1745 cb, opaque, false); 1746 } 1747 1748 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1749 QEMUIOVector *qiov, int nb_sectors, 1750 BlockCompletionFunc *cb, void *opaque) 1751 { 1752 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1753 1754 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1755 cb, opaque, true); 1756 } 1757 1758 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1759 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1760 BlockCompletionFunc *cb, void *opaque) 1761 { 1762 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1763 1764 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1765 BDRV_REQ_ZERO_WRITE | flags, 1766 cb, opaque, true); 1767 } 1768 1769 1770 typedef struct MultiwriteCB { 1771 int error; 1772 int num_requests; 1773 int num_callbacks; 1774 struct { 1775 BlockCompletionFunc *cb; 1776 void *opaque; 1777 QEMUIOVector *free_qiov; 1778 } callbacks[]; 1779 } MultiwriteCB; 1780 1781 static void multiwrite_user_cb(MultiwriteCB *mcb) 1782 { 1783 int i; 1784 1785 for (i = 0; i < mcb->num_callbacks; i++) { 1786 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1787 if (mcb->callbacks[i].free_qiov) { 1788 
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async; otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}
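/*
 * Note on the need_bh handshake above (explanatory comment only):
 * acb->req.error starts out as -EINPROGRESS, so two orderings are possible.
 * If the request coroutine completes before the submitting function regains
 * control, bdrv_co_complete() sees need_bh == true and does nothing; the
 * callback is then delivered from the bottom half scheduled by
 * bdrv_co_maybe_schedule_bh().  If the coroutine yields first,
 * bdrv_co_maybe_schedule_bh() only clears need_bh, and the callback runs
 * directly from bdrv_co_complete() when the request eventually finishes.
 * Either way the caller never sees its completion callback before the
 * submission function has returned the BlockAIOCB.
 */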
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}
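/*
 * Hypothetical sketch of how a block driver builds on the helpers above (the
 * struct and callback names are made up for illustration):
 *
 *     typedef struct MyDriverAIOCB {
 *         BlockAIOCB common;          // must be the first field
 *         int my_state;
 *     } MyDriverAIOCB;
 *
 *     static const AIOCBInfo mydriver_aiocb_info = {
 *         .aiocb_size = sizeof(MyDriverAIOCB),
 *     };
 *
 *     // submission:  MyDriverAIOCB *acb =
 *     //                  qemu_aio_get(&mydriver_aiocb_info, bs, cb, opaque);
 *     // completion:  acb->common.cb(acb->common.opaque, ret);
 *     //              qemu_aio_unref(acb);
 *
 * qemu_aio_get() starts the refcount at 1; bdrv_aio_cancel() takes an extra
 * reference while it polls, so the AIOCB is only freed once both sides have
 * dropped theirs.
 */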
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
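/*
 * Illustrative sketch (hypothetical driver, not from this tree): a format
 * driver that keeps metadata in memory typically wires up only the first
 * stage and leaves the actual disk flush to the protocol layer below:
 *
 *     static int coroutine_fn myfmt_co_flush_to_os(BlockDriverState *bs)
 *     {
 *         return myfmt_write_cached_metadata(bs);   // made-up helper
 *     }
 *
 *     .bdrv_co_flush_to_os = myfmt_co_flush_to_os,
 *
 * bdrv_co_flush() then calls bdrv_co_flush_to_disk() or bdrv_aio_flush() if
 * the driver provides one, and always recurses into bs->file so the protocol
 * layer underneath is flushed as well.
 */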
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass only the aligned/limited chunk, not the whole request */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
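/*
 * Worked example for the splitting logic in bdrv_co_discard() above (numbers
 * chosen for illustration): with bs->bl.discard_alignment == 8 and a request
 * for sectors 5..36 (sector_num == 5, nb_sectors == 32):
 *
 *     iteration 1:  num is clamped to 8 and then reduced by 5 % 8, so only
 *                   3 sectors (5..7) are discarded and the remainder of the
 *                   request becomes aligned;
 *     iteration 2+: aligned chunks of up to
 *                   MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS)
 *                   sectors, covering 8..36 (in one go if max_discard allows).
 *
 * A chunk for which the driver returns -ENOTSUP is simply skipped rather than
 * failing the whole request.
 */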
/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
    bdrv_start_throttled_reqs(bs);
}
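/*
 * Usage sketch for the plug/unplug helpers above (hypothetical device
 * emulation loop, shown for illustration only):
 *
 *     bdrv_io_plug(bs);
 *     for (i = 0; i < n; i++) {
 *         bdrv_aio_writev(bs, req[i].sector, req[i].qiov, req[i].nb_sectors,
 *                         req_cb, &req[i]);
 *     }
 *     bdrv_io_unplug(bs);   // lets the driver submit the whole batch at once
 *
 * Drivers without native plugging support simply forward the calls to
 * bs->file, so the pattern is safe on any BlockDriverState.
 */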