1 /* 2 * Block layer I/O functions 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "trace.h" 26 #include "block/blockjob.h" 27 #include "block/block_int.h" 28 #include "block/throttle-groups.h" 29 #include "qemu/error-report.h" 30 31 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 32 33 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 34 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 35 BlockCompletionFunc *cb, void *opaque); 36 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 37 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 38 BlockCompletionFunc *cb, void *opaque); 39 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 40 int64_t sector_num, int nb_sectors, 41 QEMUIOVector *iov); 42 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 43 int64_t sector_num, int nb_sectors, 44 QEMUIOVector *iov); 45 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 46 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 47 BdrvRequestFlags flags); 48 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 49 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 50 BdrvRequestFlags flags); 51 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 52 int64_t sector_num, 53 QEMUIOVector *qiov, 54 int nb_sectors, 55 BdrvRequestFlags flags, 56 BlockCompletionFunc *cb, 57 void *opaque, 58 bool is_write); 59 static void coroutine_fn bdrv_co_do_rw(void *opaque); 60 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 61 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); 62 63 /* throttling disk I/O limits */ 64 void bdrv_set_io_limits(BlockDriverState *bs, 65 ThrottleConfig *cfg) 66 { 67 int i; 68 69 throttle_group_config(bs, cfg); 70 71 for (i = 0; i < 2; i++) { 72 qemu_co_enter_next(&bs->throttled_reqs[i]); 73 } 74 } 75 76 /* this function drain all the throttled IOs */ 77 static bool bdrv_start_throttled_reqs(BlockDriverState *bs) 78 { 79 bool drained = false; 80 bool enabled = bs->io_limits_enabled; 81 int i; 82 83 bs->io_limits_enabled = false; 84 85 for (i = 0; i < 2; i++) { 86 while (qemu_co_enter_next(&bs->throttled_reqs[i])) { 87 drained = true; 88 } 89 } 90 91 bs->io_limits_enabled = enabled; 92 93 return drained; 94 } 95 96 void bdrv_io_limits_disable(BlockDriverState *bs) 97 { 98 bs->io_limits_enabled = false; 99 bdrv_start_throttled_reqs(bs); 
    throttle_group_unregister_bs(bs);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
    assert(!bs->io_limits_enabled);
    throttle_group_register_bs(bs, group);
    bs->io_limits_enabled = true;
}

void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
    /* this bs is not part of any group */
    if (!bs->throttle_state) {
        return;
    }

    /* this bs is already part of the same group as the one we want */
    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
        return;
    }

    /* need to change the group this bs belongs to */
    bdrv_io_limits_disable(bs);
    bdrv_io_limits_enable(bs, group);
}

void bdrv_setup_io_funcs(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
        bs->bl.min_mem_alignment =
            MAX(bs->bl.min_mem_alignment,
                bs->backing_hd->bl.min_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
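 *
 * An illustrative sketch of the intended pairing (the caller shown here is
 * hypothetical; any user of the flag follows the same pattern):
 *
 *     bdrv_enable_copy_on_read(bs);    // count 0 -> 1, COR turned on
 *     bdrv_enable_copy_on_read(bs);    // count 1 -> 2, still on
 *     bdrv_disable_copy_on_read(bs);   // count 2 -> 1, still on
 *     bdrv_disable_copy_on_read(bs);   // count 1 -> 0, COR off again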
203 */ 204 void bdrv_enable_copy_on_read(BlockDriverState *bs) 205 { 206 bs->copy_on_read++; 207 } 208 209 void bdrv_disable_copy_on_read(BlockDriverState *bs) 210 { 211 assert(bs->copy_on_read > 0); 212 bs->copy_on_read--; 213 } 214 215 /* Check if any requests are in-flight (including throttled requests) */ 216 static bool bdrv_requests_pending(BlockDriverState *bs) 217 { 218 if (!QLIST_EMPTY(&bs->tracked_requests)) { 219 return true; 220 } 221 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 222 return true; 223 } 224 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 225 return true; 226 } 227 if (bs->file && bdrv_requests_pending(bs->file)) { 228 return true; 229 } 230 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { 231 return true; 232 } 233 return false; 234 } 235 236 /* 237 * Wait for pending requests to complete on a single BlockDriverState subtree 238 * 239 * See the warning in bdrv_drain_all(). This function can only be called if 240 * you are sure nothing can generate I/O because you have op blockers 241 * installed. 242 * 243 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState 244 * AioContext. 245 */ 246 void bdrv_drain(BlockDriverState *bs) 247 { 248 bool busy = true; 249 250 while (busy) { 251 /* Keep iterating */ 252 bdrv_flush_io_queue(bs); 253 busy = bdrv_requests_pending(bs); 254 busy |= aio_poll(bdrv_get_aio_context(bs), busy); 255 } 256 } 257 258 /* 259 * Wait for pending requests to complete across all BlockDriverStates 260 * 261 * This function does not flush data to disk, use bdrv_flush_all() for that 262 * after calling this function. 263 * 264 * Note that completion of an asynchronous I/O operation can trigger any 265 * number of other I/O operations on other devices---for example a coroutine 266 * can be arbitrarily complex and a constant flow of I/O can come until the 267 * coroutine is complete. Because of this, it is not possible to have a 268 * function to drain a single device's I/O queue. 269 */ 270 void bdrv_drain_all(void) 271 { 272 /* Always run first iteration so any pending completion BHs run */ 273 bool busy = true; 274 BlockDriverState *bs = NULL; 275 GSList *aio_ctxs = NULL, *ctx; 276 277 while ((bs = bdrv_next(bs))) { 278 AioContext *aio_context = bdrv_get_aio_context(bs); 279 280 aio_context_acquire(aio_context); 281 if (bs->job) { 282 block_job_pause(bs->job); 283 } 284 aio_context_release(aio_context); 285 286 if (!aio_ctxs || !g_slist_find(aio_ctxs, aio_context)) { 287 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); 288 } 289 } 290 291 while (busy) { 292 busy = false; 293 294 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { 295 AioContext *aio_context = ctx->data; 296 bs = NULL; 297 298 aio_context_acquire(aio_context); 299 while ((bs = bdrv_next(bs))) { 300 if (aio_context == bdrv_get_aio_context(bs)) { 301 bdrv_flush_io_queue(bs); 302 if (bdrv_requests_pending(bs)) { 303 busy = true; 304 aio_poll(aio_context, busy); 305 } 306 } 307 } 308 busy |= aio_poll(aio_context, false); 309 aio_context_release(aio_context); 310 } 311 } 312 313 bs = NULL; 314 while ((bs = bdrv_next(bs))) { 315 AioContext *aio_context = bdrv_get_aio_context(bs); 316 317 aio_context_acquire(aio_context); 318 if (bs->job) { 319 block_job_resume(bs->job); 320 } 321 aio_context_release(aio_context); 322 } 323 g_slist_free(aio_ctxs); 324 } 325 326 /** 327 * Remove an active request from the tracked requests list 328 * 329 * This function should be called when a tracked request is completing. 
330 */ 331 static void tracked_request_end(BdrvTrackedRequest *req) 332 { 333 if (req->serialising) { 334 req->bs->serialising_in_flight--; 335 } 336 337 QLIST_REMOVE(req, list); 338 qemu_co_queue_restart_all(&req->wait_queue); 339 } 340 341 /** 342 * Add an active request to the tracked requests list 343 */ 344 static void tracked_request_begin(BdrvTrackedRequest *req, 345 BlockDriverState *bs, 346 int64_t offset, 347 unsigned int bytes, bool is_write) 348 { 349 *req = (BdrvTrackedRequest){ 350 .bs = bs, 351 .offset = offset, 352 .bytes = bytes, 353 .is_write = is_write, 354 .co = qemu_coroutine_self(), 355 .serialising = false, 356 .overlap_offset = offset, 357 .overlap_bytes = bytes, 358 }; 359 360 qemu_co_queue_init(&req->wait_queue); 361 362 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 363 } 364 365 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) 366 { 367 int64_t overlap_offset = req->offset & ~(align - 1); 368 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) 369 - overlap_offset; 370 371 if (!req->serialising) { 372 req->bs->serialising_in_flight++; 373 req->serialising = true; 374 } 375 376 req->overlap_offset = MIN(req->overlap_offset, overlap_offset); 377 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); 378 } 379 380 /** 381 * Round a region to cluster boundaries 382 */ 383 void bdrv_round_to_clusters(BlockDriverState *bs, 384 int64_t sector_num, int nb_sectors, 385 int64_t *cluster_sector_num, 386 int *cluster_nb_sectors) 387 { 388 BlockDriverInfo bdi; 389 390 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 391 *cluster_sector_num = sector_num; 392 *cluster_nb_sectors = nb_sectors; 393 } else { 394 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 395 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 396 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 397 nb_sectors, c); 398 } 399 } 400 401 static int bdrv_get_cluster_size(BlockDriverState *bs) 402 { 403 BlockDriverInfo bdi; 404 int ret; 405 406 ret = bdrv_get_info(bs, &bdi); 407 if (ret < 0 || bdi.cluster_size == 0) { 408 return bs->request_alignment; 409 } else { 410 return bdi.cluster_size; 411 } 412 } 413 414 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 415 int64_t offset, unsigned int bytes) 416 { 417 /* aaaa bbbb */ 418 if (offset >= req->overlap_offset + req->overlap_bytes) { 419 return false; 420 } 421 /* bbbb aaaa */ 422 if (req->overlap_offset >= offset + bytes) { 423 return false; 424 } 425 return true; 426 } 427 428 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) 429 { 430 BlockDriverState *bs = self->bs; 431 BdrvTrackedRequest *req; 432 bool retry; 433 bool waited = false; 434 435 if (!bs->serialising_in_flight) { 436 return false; 437 } 438 439 do { 440 retry = false; 441 QLIST_FOREACH(req, &bs->tracked_requests, list) { 442 if (req == self || (!req->serialising && !self->serialising)) { 443 continue; 444 } 445 if (tracked_request_overlaps(req, self->overlap_offset, 446 self->overlap_bytes)) 447 { 448 /* Hitting this means there was a reentrant request, for 449 * example, a block driver issuing nested requests. This must 450 * never happen since it means deadlock. 451 */ 452 assert(qemu_coroutine_self() != req->co); 453 454 /* If the request is already (indirectly) waiting for us, or 455 * will wait for us as soon as it wakes up, then just go on 456 * (instead of producing a deadlock in the former case). 
*/ 457 if (!req->waiting_for) { 458 self->waiting_for = req; 459 qemu_co_queue_wait(&req->wait_queue); 460 self->waiting_for = NULL; 461 retry = true; 462 waited = true; 463 break; 464 } 465 } 466 } 467 } while (retry); 468 469 return waited; 470 } 471 472 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 473 size_t size) 474 { 475 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { 476 return -EIO; 477 } 478 479 if (!bdrv_is_inserted(bs)) { 480 return -ENOMEDIUM; 481 } 482 483 if (offset < 0) { 484 return -EIO; 485 } 486 487 return 0; 488 } 489 490 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 491 int nb_sectors) 492 { 493 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 494 return -EIO; 495 } 496 497 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 498 nb_sectors * BDRV_SECTOR_SIZE); 499 } 500 501 typedef struct RwCo { 502 BlockDriverState *bs; 503 int64_t offset; 504 QEMUIOVector *qiov; 505 bool is_write; 506 int ret; 507 BdrvRequestFlags flags; 508 } RwCo; 509 510 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 511 { 512 RwCo *rwco = opaque; 513 514 if (!rwco->is_write) { 515 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, 516 rwco->qiov->size, rwco->qiov, 517 rwco->flags); 518 } else { 519 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, 520 rwco->qiov->size, rwco->qiov, 521 rwco->flags); 522 } 523 } 524 525 /* 526 * Process a vectored synchronous request using coroutines 527 */ 528 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, 529 QEMUIOVector *qiov, bool is_write, 530 BdrvRequestFlags flags) 531 { 532 Coroutine *co; 533 RwCo rwco = { 534 .bs = bs, 535 .offset = offset, 536 .qiov = qiov, 537 .is_write = is_write, 538 .ret = NOT_DONE, 539 .flags = flags, 540 }; 541 542 /** 543 * In sync call context, when the vcpu is blocked, this throttling timer 544 * will not fire; so the I/O throttling function has to be disabled here 545 * if it has been enabled. 546 */ 547 if (bs->io_limits_enabled) { 548 fprintf(stderr, "Disabling I/O throttling on '%s' due " 549 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 550 bdrv_io_limits_disable(bs); 551 } 552 553 if (qemu_in_coroutine()) { 554 /* Fast-path if already in coroutine context */ 555 bdrv_rw_co_entry(&rwco); 556 } else { 557 AioContext *aio_context = bdrv_get_aio_context(bs); 558 559 co = qemu_coroutine_create(bdrv_rw_co_entry); 560 qemu_coroutine_enter(co, &rwco); 561 while (rwco.ret == NOT_DONE) { 562 aio_poll(aio_context, true); 563 } 564 } 565 return rwco.ret; 566 } 567 568 /* 569 * Process a synchronous request using coroutines 570 */ 571 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 572 int nb_sectors, bool is_write, BdrvRequestFlags flags) 573 { 574 QEMUIOVector qiov; 575 struct iovec iov = { 576 .iov_base = (void *)buf, 577 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 578 }; 579 580 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 581 return -EINVAL; 582 } 583 584 qemu_iovec_init_external(&qiov, &iov, 1); 585 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, 586 &qiov, is_write, flags); 587 } 588 589 /* return < 0 if error. 
See bdrv_write() for the return codes */ 590 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 591 uint8_t *buf, int nb_sectors) 592 { 593 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 594 } 595 596 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 597 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 598 uint8_t *buf, int nb_sectors) 599 { 600 bool enabled; 601 int ret; 602 603 enabled = bs->io_limits_enabled; 604 bs->io_limits_enabled = false; 605 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 606 bs->io_limits_enabled = enabled; 607 return ret; 608 } 609 610 /* Return < 0 if error. Important errors are: 611 -EIO generic I/O error (may happen for all errors) 612 -ENOMEDIUM No media inserted. 613 -EINVAL Invalid sector number or nb_sectors 614 -EACCES Trying to write a read-only device 615 */ 616 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 617 const uint8_t *buf, int nb_sectors) 618 { 619 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 620 } 621 622 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, 623 int nb_sectors, BdrvRequestFlags flags) 624 { 625 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 626 BDRV_REQ_ZERO_WRITE | flags); 627 } 628 629 /* 630 * Completely zero out a block device with the help of bdrv_write_zeroes. 631 * The operation is sped up by checking the block status and only writing 632 * zeroes to the device if they currently do not return zeroes. Optional 633 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 634 * 635 * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 636 */ 637 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) 638 { 639 int64_t target_sectors, ret, nb_sectors, sector_num = 0; 640 int n; 641 642 target_sectors = bdrv_nb_sectors(bs); 643 if (target_sectors < 0) { 644 return target_sectors; 645 } 646 647 for (;;) { 648 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); 649 if (nb_sectors <= 0) { 650 return 0; 651 } 652 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); 653 if (ret < 0) { 654 error_report("error getting block status at sector %" PRId64 ": %s", 655 sector_num, strerror(-ret)); 656 return ret; 657 } 658 if (ret & BDRV_BLOCK_ZERO) { 659 sector_num += n; 660 continue; 661 } 662 ret = bdrv_write_zeroes(bs, sector_num, n, flags); 663 if (ret < 0) { 664 error_report("error writing zeroes at sector %" PRId64 ": %s", 665 sector_num, strerror(-ret)); 666 return ret; 667 } 668 sector_num += n; 669 } 670 } 671 672 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) 673 { 674 QEMUIOVector qiov; 675 struct iovec iov = { 676 .iov_base = (void *)buf, 677 .iov_len = bytes, 678 }; 679 int ret; 680 681 if (bytes < 0) { 682 return -EINVAL; 683 } 684 685 qemu_iovec_init_external(&qiov, &iov, 1); 686 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); 687 if (ret < 0) { 688 return ret; 689 } 690 691 return bytes; 692 } 693 694 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 695 { 696 int ret; 697 698 ret = bdrv_prwv_co(bs, offset, qiov, true, 0); 699 if (ret < 0) { 700 return ret; 701 } 702 703 return qiov->size; 704 } 705 706 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 707 const void *buf, int bytes) 708 { 709 QEMUIOVector qiov; 710 struct iovec iov = { 711 .iov_base = (void *) buf, 712 .iov_len = bytes, 713 }; 714 715 if (bytes < 0) { 716 return -EINVAL; 717 } 718 719 
qemu_iovec_init_external(&qiov, &iov, 1); 720 return bdrv_pwritev(bs, offset, &qiov); 721 } 722 723 /* 724 * Writes to the file and ensures that no writes are reordered across this 725 * request (acts as a barrier) 726 * 727 * Returns 0 on success, -errno in error cases. 728 */ 729 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 730 const void *buf, int count) 731 { 732 int ret; 733 734 ret = bdrv_pwrite(bs, offset, buf, count); 735 if (ret < 0) { 736 return ret; 737 } 738 739 /* No flush needed for cache modes that already do it */ 740 if (bs->enable_write_cache) { 741 bdrv_flush(bs); 742 } 743 744 return 0; 745 } 746 747 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 748 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 749 { 750 /* Perform I/O through a temporary buffer so that users who scribble over 751 * their read buffer while the operation is in progress do not end up 752 * modifying the image file. This is critical for zero-copy guest I/O 753 * where anything might happen inside guest memory. 754 */ 755 void *bounce_buffer; 756 757 BlockDriver *drv = bs->drv; 758 struct iovec iov; 759 QEMUIOVector bounce_qiov; 760 int64_t cluster_sector_num; 761 int cluster_nb_sectors; 762 size_t skip_bytes; 763 int ret; 764 765 /* Cover entire cluster so no additional backing file I/O is required when 766 * allocating cluster in the image file. 767 */ 768 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 769 &cluster_sector_num, &cluster_nb_sectors); 770 771 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 772 cluster_sector_num, cluster_nb_sectors); 773 774 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 775 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); 776 if (bounce_buffer == NULL) { 777 ret = -ENOMEM; 778 goto err; 779 } 780 781 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 782 783 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 784 &bounce_qiov); 785 if (ret < 0) { 786 goto err; 787 } 788 789 if (drv->bdrv_co_write_zeroes && 790 buffer_is_zero(bounce_buffer, iov.iov_len)) { 791 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 792 cluster_nb_sectors, 0); 793 } else { 794 /* This does not change the data on the disk, it is not necessary 795 * to flush even in cache=writethrough mode. 796 */ 797 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 798 &bounce_qiov); 799 } 800 801 if (ret < 0) { 802 /* It might be okay to ignore write errors for guest requests. If this 803 * is a deliberate copy-on-read then we don't want to ignore the error. 804 * Simply report it in all cases. 805 */ 806 goto err; 807 } 808 809 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 810 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 811 nb_sectors * BDRV_SECTOR_SIZE); 812 813 err: 814 qemu_vfree(bounce_buffer); 815 return ret; 816 } 817 818 /* 819 * Forwards an already correctly aligned request to the BlockDriver. This 820 * handles copy on read and zeroing after EOF; any other features must be 821 * implemented by the caller. 
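 *
 * "Correctly aligned" means what the asserts at the top of the function
 * check: offset and bytes are multiples of BDRV_SECTOR_SIZE and qiov->size
 * equals bytes. For example (illustrative numbers), offset = 8 * 512 with
 * bytes = 4 * 512 is acceptable, while a request starting at offset 100 must
 * first be padded by the caller, as bdrv_co_do_preadv() does with its
 * head_buf/tail_buf bounce buffers.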
822 */ 823 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, 824 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 825 int64_t align, QEMUIOVector *qiov, int flags) 826 { 827 BlockDriver *drv = bs->drv; 828 int ret; 829 830 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 831 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 832 833 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 834 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 835 assert(!qiov || bytes == qiov->size); 836 837 /* Handle Copy on Read and associated serialisation */ 838 if (flags & BDRV_REQ_COPY_ON_READ) { 839 /* If we touch the same cluster it counts as an overlap. This 840 * guarantees that allocating writes will be serialized and not race 841 * with each other for the same cluster. For example, in copy-on-read 842 * it ensures that the CoR read and write operations are atomic and 843 * guest writes cannot interleave between them. */ 844 mark_request_serialising(req, bdrv_get_cluster_size(bs)); 845 } 846 847 wait_serialising_requests(req); 848 849 if (flags & BDRV_REQ_COPY_ON_READ) { 850 int pnum; 851 852 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 853 if (ret < 0) { 854 goto out; 855 } 856 857 if (!ret || pnum != nb_sectors) { 858 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 859 goto out; 860 } 861 } 862 863 /* Forward the request to the BlockDriver */ 864 if (!bs->zero_beyond_eof) { 865 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 866 } else { 867 /* Read zeros after EOF */ 868 int64_t total_sectors, max_nb_sectors; 869 870 total_sectors = bdrv_nb_sectors(bs); 871 if (total_sectors < 0) { 872 ret = total_sectors; 873 goto out; 874 } 875 876 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), 877 align >> BDRV_SECTOR_BITS); 878 if (nb_sectors < max_nb_sectors) { 879 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 880 } else if (max_nb_sectors > 0) { 881 QEMUIOVector local_qiov; 882 883 qemu_iovec_init(&local_qiov, qiov->niov); 884 qemu_iovec_concat(&local_qiov, qiov, 0, 885 max_nb_sectors * BDRV_SECTOR_SIZE); 886 887 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, 888 &local_qiov); 889 890 qemu_iovec_destroy(&local_qiov); 891 } else { 892 ret = 0; 893 } 894 895 /* Reading beyond end of file is supposed to produce zeroes */ 896 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 897 uint64_t offset = MAX(0, total_sectors - sector_num); 898 uint64_t bytes = (sector_num + nb_sectors - offset) * 899 BDRV_SECTOR_SIZE; 900 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 901 } 902 } 903 904 out: 905 return ret; 906 } 907 908 /* 909 * Handle a read request in coroutine context 910 */ 911 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, 912 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 913 BdrvRequestFlags flags) 914 { 915 BlockDriver *drv = bs->drv; 916 BdrvTrackedRequest req; 917 918 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 919 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 920 uint8_t *head_buf = NULL; 921 uint8_t *tail_buf = NULL; 922 QEMUIOVector local_qiov; 923 bool use_local_qiov = false; 924 int ret; 925 926 if (!drv) { 927 return -ENOMEDIUM; 928 } 929 930 ret = bdrv_check_byte_request(bs, offset, bytes); 931 if (ret < 0) { 932 return ret; 933 } 934 935 if (bs->copy_on_read) { 936 flags |= BDRV_REQ_COPY_ON_READ; 937 } 938 939 /* throttling disk I/O */ 940 if (bs->io_limits_enabled) { 941 
throttle_group_co_io_limits_intercept(bs, bytes, false); 942 } 943 944 /* Align read if necessary by padding qiov */ 945 if (offset & (align - 1)) { 946 head_buf = qemu_blockalign(bs, align); 947 qemu_iovec_init(&local_qiov, qiov->niov + 2); 948 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 949 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 950 use_local_qiov = true; 951 952 bytes += offset & (align - 1); 953 offset = offset & ~(align - 1); 954 } 955 956 if ((offset + bytes) & (align - 1)) { 957 if (!use_local_qiov) { 958 qemu_iovec_init(&local_qiov, qiov->niov + 1); 959 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 960 use_local_qiov = true; 961 } 962 tail_buf = qemu_blockalign(bs, align); 963 qemu_iovec_add(&local_qiov, tail_buf, 964 align - ((offset + bytes) & (align - 1))); 965 966 bytes = ROUND_UP(bytes, align); 967 } 968 969 tracked_request_begin(&req, bs, offset, bytes, false); 970 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, 971 use_local_qiov ? &local_qiov : qiov, 972 flags); 973 tracked_request_end(&req); 974 975 if (use_local_qiov) { 976 qemu_iovec_destroy(&local_qiov); 977 qemu_vfree(head_buf); 978 qemu_vfree(tail_buf); 979 } 980 981 return ret; 982 } 983 984 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 985 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 986 BdrvRequestFlags flags) 987 { 988 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 989 return -EINVAL; 990 } 991 992 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, 993 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 994 } 995 996 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 997 int nb_sectors, QEMUIOVector *qiov) 998 { 999 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 1000 1001 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 1002 } 1003 1004 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 1005 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 1006 { 1007 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 1008 1009 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 1010 BDRV_REQ_COPY_ON_READ); 1011 } 1012 1013 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 1014 1015 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 1016 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) 1017 { 1018 BlockDriver *drv = bs->drv; 1019 QEMUIOVector qiov; 1020 struct iovec iov = {0}; 1021 int ret = 0; 1022 1023 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, 1024 BDRV_REQUEST_MAX_SECTORS); 1025 1026 while (nb_sectors > 0 && !ret) { 1027 int num = nb_sectors; 1028 1029 /* Align request. Block drivers can expect the "bulk" of the request 1030 * to be aligned. 1031 */ 1032 if (bs->bl.write_zeroes_alignment 1033 && num > bs->bl.write_zeroes_alignment) { 1034 if (sector_num % bs->bl.write_zeroes_alignment != 0) { 1035 /* Make a small request up to the first aligned sector. */ 1036 num = bs->bl.write_zeroes_alignment; 1037 num -= sector_num % bs->bl.write_zeroes_alignment; 1038 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { 1039 /* Shorten the request to the last aligned sector. num cannot 1040 * underflow because num > bs->bl.write_zeroes_alignment. 
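                 *
                 * Worked example (numbers purely illustrative): with
                 * write_zeroes_alignment = 8, sector_num = 8 and num = 13,
                 * (sector_num + num) % 8 == 5, so num becomes 8 here and the
                 * remaining 5 unaligned sectors are zeroed by the next
                 * iteration of the loop.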
1041 */ 1042 num -= (sector_num + num) % bs->bl.write_zeroes_alignment; 1043 } 1044 } 1045 1046 /* limit request size */ 1047 if (num > max_write_zeroes) { 1048 num = max_write_zeroes; 1049 } 1050 1051 ret = -ENOTSUP; 1052 /* First try the efficient write zeroes operation */ 1053 if (drv->bdrv_co_write_zeroes) { 1054 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); 1055 } 1056 1057 if (ret == -ENOTSUP) { 1058 /* Fall back to bounce buffer if write zeroes is unsupported */ 1059 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, 1060 MAX_WRITE_ZEROES_BOUNCE_BUFFER); 1061 num = MIN(num, max_xfer_len); 1062 iov.iov_len = num * BDRV_SECTOR_SIZE; 1063 if (iov.iov_base == NULL) { 1064 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); 1065 if (iov.iov_base == NULL) { 1066 ret = -ENOMEM; 1067 goto fail; 1068 } 1069 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); 1070 } 1071 qemu_iovec_init_external(&qiov, &iov, 1); 1072 1073 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); 1074 1075 /* Keep bounce buffer around if it is big enough for all 1076 * all future requests. 1077 */ 1078 if (num < max_xfer_len) { 1079 qemu_vfree(iov.iov_base); 1080 iov.iov_base = NULL; 1081 } 1082 } 1083 1084 sector_num += num; 1085 nb_sectors -= num; 1086 } 1087 1088 fail: 1089 qemu_vfree(iov.iov_base); 1090 return ret; 1091 } 1092 1093 /* 1094 * Forwards an already correctly aligned write request to the BlockDriver. 1095 */ 1096 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, 1097 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1098 QEMUIOVector *qiov, int flags) 1099 { 1100 BlockDriver *drv = bs->drv; 1101 bool waited; 1102 int ret; 1103 1104 int64_t sector_num = offset >> BDRV_SECTOR_BITS; 1105 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; 1106 1107 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1108 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1109 assert(!qiov || bytes == qiov->size); 1110 1111 waited = wait_serialising_requests(req); 1112 assert(!waited || !req->serialising); 1113 assert(req->overlap_offset <= offset); 1114 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1115 1116 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 1117 1118 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1119 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && 1120 qemu_iovec_is_zero(qiov)) { 1121 flags |= BDRV_REQ_ZERO_WRITE; 1122 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 1123 flags |= BDRV_REQ_MAY_UNMAP; 1124 } 1125 } 1126 1127 if (ret < 0) { 1128 /* Do nothing, write notifier decided to fail this request */ 1129 } else if (flags & BDRV_REQ_ZERO_WRITE) { 1130 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); 1131 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); 1132 } else { 1133 BLKDBG_EVENT(bs, BLKDBG_PWRITEV); 1134 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 1135 } 1136 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); 1137 1138 if (ret == 0 && !bs->enable_write_cache) { 1139 ret = bdrv_co_flush(bs); 1140 } 1141 1142 bdrv_set_dirty(bs, sector_num, nb_sectors); 1143 1144 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); 1145 1146 if (ret >= 0) { 1147 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 1148 } 1149 1150 return ret; 1151 } 1152 1153 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, 1154 int64_t offset, 1155 unsigned int bytes, 1156 BdrvRequestFlags flags, 1157 
BdrvTrackedRequest *req) 1158 { 1159 uint8_t *buf = NULL; 1160 QEMUIOVector local_qiov; 1161 struct iovec iov; 1162 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1163 unsigned int head_padding_bytes, tail_padding_bytes; 1164 int ret = 0; 1165 1166 head_padding_bytes = offset & (align - 1); 1167 tail_padding_bytes = align - ((offset + bytes) & (align - 1)); 1168 1169 1170 assert(flags & BDRV_REQ_ZERO_WRITE); 1171 if (head_padding_bytes || tail_padding_bytes) { 1172 buf = qemu_blockalign(bs, align); 1173 iov = (struct iovec) { 1174 .iov_base = buf, 1175 .iov_len = align, 1176 }; 1177 qemu_iovec_init_external(&local_qiov, &iov, 1); 1178 } 1179 if (head_padding_bytes) { 1180 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 1181 1182 /* RMW the unaligned part before head. */ 1183 mark_request_serialising(req, align); 1184 wait_serialising_requests(req); 1185 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 1186 ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, 1187 align, &local_qiov, 0); 1188 if (ret < 0) { 1189 goto fail; 1190 } 1191 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1192 1193 memset(buf + head_padding_bytes, 0, zero_bytes); 1194 ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, 1195 &local_qiov, 1196 flags & ~BDRV_REQ_ZERO_WRITE); 1197 if (ret < 0) { 1198 goto fail; 1199 } 1200 offset += zero_bytes; 1201 bytes -= zero_bytes; 1202 } 1203 1204 assert(!bytes || (offset & (align - 1)) == 0); 1205 if (bytes >= align) { 1206 /* Write the aligned part in the middle. */ 1207 uint64_t aligned_bytes = bytes & ~(align - 1); 1208 ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, 1209 NULL, flags); 1210 if (ret < 0) { 1211 goto fail; 1212 } 1213 bytes -= aligned_bytes; 1214 offset += aligned_bytes; 1215 } 1216 1217 assert(!bytes || (offset & (align - 1)) == 0); 1218 if (bytes) { 1219 assert(align == tail_padding_bytes + bytes); 1220 /* RMW the unaligned part after tail. */ 1221 mark_request_serialising(req, align); 1222 wait_serialising_requests(req); 1223 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 1224 ret = bdrv_aligned_preadv(bs, req, offset, align, 1225 align, &local_qiov, 0); 1226 if (ret < 0) { 1227 goto fail; 1228 } 1229 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1230 1231 memset(buf, 0, bytes); 1232 ret = bdrv_aligned_pwritev(bs, req, offset, align, 1233 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 1234 } 1235 fail: 1236 qemu_vfree(buf); 1237 return ret; 1238 1239 } 1240 1241 /* 1242 * Handle a write request in coroutine context 1243 */ 1244 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, 1245 int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 1246 BdrvRequestFlags flags) 1247 { 1248 BdrvTrackedRequest req; 1249 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ 1250 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); 1251 uint8_t *head_buf = NULL; 1252 uint8_t *tail_buf = NULL; 1253 QEMUIOVector local_qiov; 1254 bool use_local_qiov = false; 1255 int ret; 1256 1257 if (!bs->drv) { 1258 return -ENOMEDIUM; 1259 } 1260 if (bs->read_only) { 1261 return -EPERM; 1262 } 1263 1264 ret = bdrv_check_byte_request(bs, offset, bytes); 1265 if (ret < 0) { 1266 return ret; 1267 } 1268 1269 /* throttling disk I/O */ 1270 if (bs->io_limits_enabled) { 1271 throttle_group_co_io_limits_intercept(bs, bytes, true); 1272 } 1273 1274 /* 1275 * Align write if necessary by performing a read-modify-write cycle. 
1276 * Pad qiov with the read parts and be sure to have a tracked request not 1277 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 1278 */ 1279 tracked_request_begin(&req, bs, offset, bytes, true); 1280 1281 if (!qiov) { 1282 ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); 1283 goto out; 1284 } 1285 1286 if (offset & (align - 1)) { 1287 QEMUIOVector head_qiov; 1288 struct iovec head_iov; 1289 1290 mark_request_serialising(&req, align); 1291 wait_serialising_requests(&req); 1292 1293 head_buf = qemu_blockalign(bs, align); 1294 head_iov = (struct iovec) { 1295 .iov_base = head_buf, 1296 .iov_len = align, 1297 }; 1298 qemu_iovec_init_external(&head_qiov, &head_iov, 1); 1299 1300 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); 1301 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, 1302 align, &head_qiov, 0); 1303 if (ret < 0) { 1304 goto fail; 1305 } 1306 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 1307 1308 qemu_iovec_init(&local_qiov, qiov->niov + 2); 1309 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 1310 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1311 use_local_qiov = true; 1312 1313 bytes += offset & (align - 1); 1314 offset = offset & ~(align - 1); 1315 } 1316 1317 if ((offset + bytes) & (align - 1)) { 1318 QEMUIOVector tail_qiov; 1319 struct iovec tail_iov; 1320 size_t tail_bytes; 1321 bool waited; 1322 1323 mark_request_serialising(&req, align); 1324 waited = wait_serialising_requests(&req); 1325 assert(!waited || !use_local_qiov); 1326 1327 tail_buf = qemu_blockalign(bs, align); 1328 tail_iov = (struct iovec) { 1329 .iov_base = tail_buf, 1330 .iov_len = align, 1331 }; 1332 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 1333 1334 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); 1335 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, 1336 align, &tail_qiov, 0); 1337 if (ret < 0) { 1338 goto fail; 1339 } 1340 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 1341 1342 if (!use_local_qiov) { 1343 qemu_iovec_init(&local_qiov, qiov->niov + 1); 1344 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 1345 use_local_qiov = true; 1346 } 1347 1348 tail_bytes = (offset + bytes) & (align - 1); 1349 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 1350 1351 bytes = ROUND_UP(bytes, align); 1352 } 1353 1354 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, 1355 use_local_qiov ? 
&local_qiov : qiov, 1356 flags); 1357 1358 fail: 1359 1360 if (use_local_qiov) { 1361 qemu_iovec_destroy(&local_qiov); 1362 } 1363 qemu_vfree(head_buf); 1364 qemu_vfree(tail_buf); 1365 out: 1366 tracked_request_end(&req); 1367 return ret; 1368 } 1369 1370 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 1371 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 1372 BdrvRequestFlags flags) 1373 { 1374 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { 1375 return -EINVAL; 1376 } 1377 1378 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 1379 nb_sectors << BDRV_SECTOR_BITS, qiov, flags); 1380 } 1381 1382 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 1383 int nb_sectors, QEMUIOVector *qiov) 1384 { 1385 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 1386 1387 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 1388 } 1389 1390 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 1391 int64_t sector_num, int nb_sectors, 1392 BdrvRequestFlags flags) 1393 { 1394 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); 1395 1396 if (!(bs->open_flags & BDRV_O_UNMAP)) { 1397 flags &= ~BDRV_REQ_MAY_UNMAP; 1398 } 1399 1400 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 1401 BDRV_REQ_ZERO_WRITE | flags); 1402 } 1403 1404 int bdrv_flush_all(void) 1405 { 1406 BlockDriverState *bs = NULL; 1407 int result = 0; 1408 1409 while ((bs = bdrv_next(bs))) { 1410 AioContext *aio_context = bdrv_get_aio_context(bs); 1411 int ret; 1412 1413 aio_context_acquire(aio_context); 1414 ret = bdrv_flush(bs); 1415 if (ret < 0 && !result) { 1416 result = ret; 1417 } 1418 aio_context_release(aio_context); 1419 } 1420 1421 return result; 1422 } 1423 1424 typedef struct BdrvCoGetBlockStatusData { 1425 BlockDriverState *bs; 1426 BlockDriverState *base; 1427 int64_t sector_num; 1428 int nb_sectors; 1429 int *pnum; 1430 int64_t ret; 1431 bool done; 1432 } BdrvCoGetBlockStatusData; 1433 1434 /* 1435 * Returns the allocation status of the specified sectors. 1436 * Drivers not implementing the functionality are assumed to not support 1437 * backing files, hence all their sectors are reported as allocated. 1438 * 1439 * If 'sector_num' is beyond the end of the disk image the return value is 0 1440 * and 'pnum' is set to 0. 1441 * 1442 * 'pnum' is set to the number of sectors (including and immediately following 1443 * the specified sector) that are known to be in the same 1444 * allocated/unallocated state. 1445 * 1446 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 1447 * beyond the end of the disk image it will be clamped. 
1448 */ 1449 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 1450 int64_t sector_num, 1451 int nb_sectors, int *pnum) 1452 { 1453 int64_t total_sectors; 1454 int64_t n; 1455 int64_t ret, ret2; 1456 1457 total_sectors = bdrv_nb_sectors(bs); 1458 if (total_sectors < 0) { 1459 return total_sectors; 1460 } 1461 1462 if (sector_num >= total_sectors) { 1463 *pnum = 0; 1464 return 0; 1465 } 1466 1467 n = total_sectors - sector_num; 1468 if (n < nb_sectors) { 1469 nb_sectors = n; 1470 } 1471 1472 if (!bs->drv->bdrv_co_get_block_status) { 1473 *pnum = nb_sectors; 1474 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 1475 if (bs->drv->protocol_name) { 1476 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 1477 } 1478 return ret; 1479 } 1480 1481 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 1482 if (ret < 0) { 1483 *pnum = 0; 1484 return ret; 1485 } 1486 1487 if (ret & BDRV_BLOCK_RAW) { 1488 assert(ret & BDRV_BLOCK_OFFSET_VALID); 1489 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 1490 *pnum, pnum); 1491 } 1492 1493 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 1494 ret |= BDRV_BLOCK_ALLOCATED; 1495 } else { 1496 if (bdrv_unallocated_blocks_are_zero(bs)) { 1497 ret |= BDRV_BLOCK_ZERO; 1498 } else if (bs->backing_hd) { 1499 BlockDriverState *bs2 = bs->backing_hd; 1500 int64_t nb_sectors2 = bdrv_nb_sectors(bs2); 1501 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { 1502 ret |= BDRV_BLOCK_ZERO; 1503 } 1504 } 1505 } 1506 1507 if (bs->file && 1508 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 1509 (ret & BDRV_BLOCK_OFFSET_VALID)) { 1510 int file_pnum; 1511 1512 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 1513 *pnum, &file_pnum); 1514 if (ret2 >= 0) { 1515 /* Ignore errors. This is just providing extra information, it 1516 * is useful but not necessary. 1517 */ 1518 if (!file_pnum) { 1519 /* !file_pnum indicates an offset at or beyond the EOF; it is 1520 * perfectly valid for the format block driver to point to such 1521 * offsets, so catch it and mark everything as zero */ 1522 ret |= BDRV_BLOCK_ZERO; 1523 } else { 1524 /* Limit request to the range reported by the protocol driver */ 1525 *pnum = file_pnum; 1526 ret |= (ret2 & BDRV_BLOCK_ZERO); 1527 } 1528 } 1529 } 1530 1531 return ret; 1532 } 1533 1534 /* Coroutine wrapper for bdrv_get_block_status() */ 1535 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) 1536 { 1537 BdrvCoGetBlockStatusData *data = opaque; 1538 BlockDriverState *bs = data->bs; 1539 1540 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, 1541 data->pnum); 1542 data->done = true; 1543 } 1544 1545 /* 1546 * Synchronous wrapper around bdrv_co_get_block_status(). 1547 * 1548 * See bdrv_co_get_block_status() for details. 
1549 */ 1550 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, 1551 int nb_sectors, int *pnum) 1552 { 1553 Coroutine *co; 1554 BdrvCoGetBlockStatusData data = { 1555 .bs = bs, 1556 .sector_num = sector_num, 1557 .nb_sectors = nb_sectors, 1558 .pnum = pnum, 1559 .done = false, 1560 }; 1561 1562 if (qemu_in_coroutine()) { 1563 /* Fast-path if already in coroutine context */ 1564 bdrv_get_block_status_co_entry(&data); 1565 } else { 1566 AioContext *aio_context = bdrv_get_aio_context(bs); 1567 1568 co = qemu_coroutine_create(bdrv_get_block_status_co_entry); 1569 qemu_coroutine_enter(co, &data); 1570 while (!data.done) { 1571 aio_poll(aio_context, true); 1572 } 1573 } 1574 return data.ret; 1575 } 1576 1577 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 1578 int nb_sectors, int *pnum) 1579 { 1580 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 1581 if (ret < 0) { 1582 return ret; 1583 } 1584 return !!(ret & BDRV_BLOCK_ALLOCATED); 1585 } 1586 1587 /* 1588 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 1589 * 1590 * Return true if the given sector is allocated in any image between 1591 * BASE and TOP (inclusive). BASE can be NULL to check if the given 1592 * sector is allocated in any image of the chain. Return false otherwise. 1593 * 1594 * 'pnum' is set to the number of sectors (including and immediately following 1595 * the specified sector) that are known to be in the same 1596 * allocated/unallocated state. 1597 * 1598 */ 1599 int bdrv_is_allocated_above(BlockDriverState *top, 1600 BlockDriverState *base, 1601 int64_t sector_num, 1602 int nb_sectors, int *pnum) 1603 { 1604 BlockDriverState *intermediate; 1605 int ret, n = nb_sectors; 1606 1607 intermediate = top; 1608 while (intermediate && intermediate != base) { 1609 int pnum_inter; 1610 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 1611 &pnum_inter); 1612 if (ret < 0) { 1613 return ret; 1614 } else if (ret) { 1615 *pnum = pnum_inter; 1616 return 1; 1617 } 1618 1619 /* 1620 * [sector_num, nb_sectors] is unallocated on top but intermediate 1621 * might have 1622 * 1623 * [sector_num+x, nr_sectors] allocated. 
1624 */ 1625 if (n > pnum_inter && 1626 (intermediate == top || 1627 sector_num + pnum_inter < intermediate->total_sectors)) { 1628 n = pnum_inter; 1629 } 1630 1631 intermediate = intermediate->backing_hd; 1632 } 1633 1634 *pnum = n; 1635 return 0; 1636 } 1637 1638 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 1639 const uint8_t *buf, int nb_sectors) 1640 { 1641 BlockDriver *drv = bs->drv; 1642 int ret; 1643 1644 if (!drv) { 1645 return -ENOMEDIUM; 1646 } 1647 if (!drv->bdrv_write_compressed) { 1648 return -ENOTSUP; 1649 } 1650 ret = bdrv_check_request(bs, sector_num, nb_sectors); 1651 if (ret < 0) { 1652 return ret; 1653 } 1654 1655 assert(QLIST_EMPTY(&bs->dirty_bitmaps)); 1656 1657 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 1658 } 1659 1660 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 1661 int64_t pos, int size) 1662 { 1663 QEMUIOVector qiov; 1664 struct iovec iov = { 1665 .iov_base = (void *) buf, 1666 .iov_len = size, 1667 }; 1668 1669 qemu_iovec_init_external(&qiov, &iov, 1); 1670 return bdrv_writev_vmstate(bs, &qiov, pos); 1671 } 1672 1673 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 1674 { 1675 BlockDriver *drv = bs->drv; 1676 1677 if (!drv) { 1678 return -ENOMEDIUM; 1679 } else if (drv->bdrv_save_vmstate) { 1680 return drv->bdrv_save_vmstate(bs, qiov, pos); 1681 } else if (bs->file) { 1682 return bdrv_writev_vmstate(bs->file, qiov, pos); 1683 } 1684 1685 return -ENOTSUP; 1686 } 1687 1688 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1689 int64_t pos, int size) 1690 { 1691 BlockDriver *drv = bs->drv; 1692 if (!drv) 1693 return -ENOMEDIUM; 1694 if (drv->bdrv_load_vmstate) 1695 return drv->bdrv_load_vmstate(bs, buf, pos, size); 1696 if (bs->file) 1697 return bdrv_load_vmstate(bs->file, buf, pos, size); 1698 return -ENOTSUP; 1699 } 1700 1701 /**************************************************************/ 1702 /* async I/Os */ 1703 1704 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 1705 QEMUIOVector *qiov, int nb_sectors, 1706 BlockCompletionFunc *cb, void *opaque) 1707 { 1708 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 1709 1710 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1711 cb, opaque, false); 1712 } 1713 1714 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 1715 QEMUIOVector *qiov, int nb_sectors, 1716 BlockCompletionFunc *cb, void *opaque) 1717 { 1718 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 1719 1720 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, 1721 cb, opaque, true); 1722 } 1723 1724 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, 1725 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, 1726 BlockCompletionFunc *cb, void *opaque) 1727 { 1728 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); 1729 1730 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, 1731 BDRV_REQ_ZERO_WRITE | flags, 1732 cb, opaque, true); 1733 } 1734 1735 1736 typedef struct MultiwriteCB { 1737 int error; 1738 int num_requests; 1739 int num_callbacks; 1740 struct { 1741 BlockCompletionFunc *cb; 1742 void *opaque; 1743 QEMUIOVector *free_qiov; 1744 } callbacks[]; 1745 } MultiwriteCB; 1746 1747 static void multiwrite_user_cb(MultiwriteCB *mcb) 1748 { 1749 int i; 1750 1751 for (i = 0; i < mcb->num_callbacks; i++) { 1752 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 1753 if (mcb->callbacks[i].free_qiov) { 1754 
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert(reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);

    return outidx + 1;
}

/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In the error case this function returns -1, and any of the
 * requests may or may not be submitted yet.
In particular, this means that the 1873 * callback will be called for some of the requests, for others it won't. The 1874 * caller must check the error field of the BlockRequest to wait for the right 1875 * callbacks (if error != 0, no callback will be called). 1876 * 1877 * The implementation may modify the contents of the reqs array, e.g. to merge 1878 * requests. However, the fields opaque and error are left unmodified as they 1879 * are used to signal failure for a single request to the caller. 1880 */ 1881 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 1882 { 1883 MultiwriteCB *mcb; 1884 int i; 1885 1886 /* don't submit writes if we don't have a medium */ 1887 if (bs->drv == NULL) { 1888 for (i = 0; i < num_reqs; i++) { 1889 reqs[i].error = -ENOMEDIUM; 1890 } 1891 return -1; 1892 } 1893 1894 if (num_reqs == 0) { 1895 return 0; 1896 } 1897 1898 // Create MultiwriteCB structure 1899 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 1900 mcb->num_requests = 0; 1901 mcb->num_callbacks = num_reqs; 1902 1903 for (i = 0; i < num_reqs; i++) { 1904 mcb->callbacks[i].cb = reqs[i].cb; 1905 mcb->callbacks[i].opaque = reqs[i].opaque; 1906 } 1907 1908 // Check for mergable requests 1909 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 1910 1911 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 1912 1913 /* Run the aio requests. */ 1914 mcb->num_requests = num_reqs; 1915 for (i = 0; i < num_reqs; i++) { 1916 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, 1917 reqs[i].nb_sectors, reqs[i].flags, 1918 multiwrite_cb, mcb, 1919 true); 1920 } 1921 1922 return 0; 1923 } 1924 1925 void bdrv_aio_cancel(BlockAIOCB *acb) 1926 { 1927 qemu_aio_ref(acb); 1928 bdrv_aio_cancel_async(acb); 1929 while (acb->refcnt > 1) { 1930 if (acb->aiocb_info->get_aio_context) { 1931 aio_poll(acb->aiocb_info->get_aio_context(acb), true); 1932 } else if (acb->bs) { 1933 aio_poll(bdrv_get_aio_context(acb->bs), true); 1934 } else { 1935 abort(); 1936 } 1937 } 1938 qemu_aio_unref(acb); 1939 } 1940 1941 /* Async version of aio cancel. The caller is not blocked if the acb implements 1942 * cancel_async, otherwise we do nothing and let the request normally complete. 1943 * In either case the completion callback must be called. 
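 *
 * bdrv_aio_cancel() above shows the blocking counterpart of this pattern:
 * it takes an extra reference, calls this function and then polls the
 * request's AioContext until only its own reference is left, i.e. until the
 * completion callback has run.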

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request complete normally.
 * In either case the completion callback must be called.
 */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBSync),
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}

static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)
{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;
    } else if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}
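
/*
 * Illustrative sketch (not part of the original code): the allocation and
 * completion pattern expected of qemu_aio_get()/qemu_aio_unref() users, in
 * the same shape as BlockAIOCBSync/BlockAIOCBCoroutine above.  ExampleAIOCB
 * and example_aio_nop() are hypothetical names.  Completion is deferred to a
 * bottom half so that the caller always sees the returned BlockAIOCB before
 * its callback runs, and the AIOCB is released with qemu_aio_unref() after
 * the callback.
 */
typedef struct ExampleAIOCB {
    BlockAIOCB common;
    QEMUBH *bh;
    int ret;
} ExampleAIOCB;

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(ExampleAIOCB),
};

static void example_aio_bh_cb(void *opaque)
{
    ExampleAIOCB *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    qemu_aio_unref(acb);
}

static BlockAIOCB *example_aio_nop(BlockDriverState *bs,
                                   BlockCompletionFunc *cb, void *opaque)
{
    ExampleAIOCB *acb = qemu_aio_get(&example_aiocb_info, bs, cb, opaque);

    acb->ret = 0;   /* the "operation" completes immediately and successfully */
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), example_aio_bh_cb, acb);
    qemu_bh_schedule(acb->bh);
    return &acb->common;
}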

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass the clipped chunk (num), not the whole remaining range */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
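
/*
 * Illustrative sketch (not part of the original code): how a caller outside
 * coroutine context might use the synchronous wrappers above.  The function
 * name is hypothetical; both wrappers take 512-byte sectors, so byte offsets
 * are converted with BDRV_SECTOR_SIZE.
 */
static int example_trim_and_flush(BlockDriverState *bs, int64_t offset,
                                  int64_t bytes)
{
    int64_t sector_num = offset / BDRV_SECTOR_SIZE;
    int nb_sectors = bytes / BDRV_SECTOR_SIZE;
    int ret;

    ret = bdrv_discard(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    }

    /* Make sure any metadata the driver updated reaches stable storage too */
    return bdrv_flush(bs);
}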

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
    bdrv_start_throttled_reqs(bs);
}
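
/*
 * Illustrative sketch (not part of the original code): allocating a buffer
 * that satisfies the device's memory alignment with qemu_blockalign() and
 * verifying it with bdrv_qiov_is_aligned() before issuing vectored I/O.  The
 * function name is hypothetical and it must run in coroutine context because
 * it calls bdrv_co_readv().
 */
static int coroutine_fn example_aligned_readv(BlockDriverState *bs,
                                              int64_t sector_num,
                                              int nb_sectors)
{
    size_t bytes = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
    uint8_t *buf = qemu_blockalign(bs, bytes);
    QEMUIOVector qiov;
    int ret;

    qemu_iovec_init(&qiov, 1);
    qemu_iovec_add(&qiov, buf, bytes);

    /* Buffers from qemu_blockalign() always pass this check */
    assert(bdrv_qiov_is_aligned(bs, &qiov));

    ret = bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);

    qemu_iovec_destroy(&qiov);
    qemu_vfree(buf);
    return ret;
}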