1 /* 2 * preallocate filter driver 3 * 4 * The driver performs preallocate operation: it is injected above 5 * some node, and before each write over EOF it does additional preallocating 6 * write-zeroes request. 7 * 8 * Copyright (c) 2020 Virtuozzo International GmbH. 9 * 10 * Author: 11 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com> 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 * 23 * You should have received a copy of the GNU General Public License 24 * along with this program. If not, see <http://www.gnu.org/licenses/>. 25 */ 26 27 #include "qemu/osdep.h" 28 29 #include "qapi/error.h" 30 #include "qemu/module.h" 31 #include "qemu/option.h" 32 #include "qemu/units.h" 33 #include "block/block-io.h" 34 #include "block/block_int.h" 35 36 37 typedef struct PreallocateOpts { 38 int64_t prealloc_size; 39 int64_t prealloc_align; 40 } PreallocateOpts; 41 42 typedef struct BDRVPreallocateState { 43 PreallocateOpts opts; 44 45 /* 46 * Track real data end, to crop preallocation on close. If < 0 the status is 47 * unknown. 48 * 49 * @data_end is a maximum of file size on open (or when we get write/resize 50 * permissions) and all write request ends after it. So it's safe to 51 * truncate to data_end if it is valid. 52 */ 53 int64_t data_end; 54 55 /* 56 * Start of trailing preallocated area which reads as zero. May be smaller 57 * than data_end, if user does over-EOF write zero operation. If < 0 the 58 * status is unknown. 59 * 60 * If both @zero_start and @file_end are valid, the region 61 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end 62 * is not valid, @zero_start doesn't make much sense. 63 */ 64 int64_t zero_start; 65 66 /* 67 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs), 68 * to avoid extra lseek() calls on each write operation. If < 0 the status 69 * is unknown. 70 */ 71 int64_t file_end; 72 73 /* 74 * All three states @data_end, @zero_start and @file_end are guaranteed to 75 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and 76 * BLK_PERM_WRITE permissions on file child. 77 */ 78 79 /* Gives up the resize permission on children when parents don't need it */ 80 QEMUBH *drop_resize_bh; 81 } BDRVPreallocateState; 82 83 static int preallocate_drop_resize(BlockDriverState *bs, Error **errp); 84 static void preallocate_drop_resize_bh(void *opaque); 85 86 #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align" 87 #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size" 88 static QemuOptsList runtime_opts = { 89 .name = "preallocate", 90 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), 91 .desc = { 92 { 93 .name = PREALLOCATE_OPT_PREALLOC_ALIGN, 94 .type = QEMU_OPT_SIZE, 95 .help = "on preallocation, align file length to this number, " 96 "default 1M", 97 }, 98 { 99 .name = PREALLOCATE_OPT_PREALLOC_SIZE, 100 .type = QEMU_OPT_SIZE, 101 .help = "how much to preallocate, default 128M", 102 }, 103 { /* end of list */ } 104 }, 105 }; 106 107 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options, 108 BlockDriverState *child_bs, Error **errp) 109 { 110 QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); 111 112 if (!qemu_opts_absorb_qdict(opts, options, errp)) { 113 return false; 114 } 115 116 dest->prealloc_align = 117 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB); 118 dest->prealloc_size = 119 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB); 120 121 qemu_opts_del(opts); 122 123 if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) { 124 error_setg(errp, "prealloc-align parameter of preallocate filter " 125 "is not aligned to %llu", BDRV_SECTOR_SIZE); 126 return false; 127 } 128 129 if (!QEMU_IS_ALIGNED(dest->prealloc_align, 130 child_bs->bl.request_alignment)) { 131 error_setg(errp, "prealloc-align parameter of preallocate filter " 132 "is not aligned to underlying node request alignment " 133 "(%" PRIi32 ")", child_bs->bl.request_alignment); 134 return false; 135 } 136 137 return true; 138 } 139 140 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, 141 Error **errp) 142 { 143 BDRVPreallocateState *s = bs->opaque; 144 int ret; 145 146 GLOBAL_STATE_CODE(); 147 148 /* 149 * s->data_end and friends should be initialized on permission update. 150 * For this to work, mark them invalid. 151 */ 152 s->file_end = s->zero_start = s->data_end = -EINVAL; 153 s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs); 154 155 ret = bdrv_open_file_child(NULL, options, "file", bs, errp); 156 if (ret < 0) { 157 return ret; 158 } 159 160 GRAPH_RDLOCK_GUARD_MAINLOOP(); 161 162 if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) { 163 return -EINVAL; 164 } 165 166 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | 167 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); 168 169 bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | 170 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & 171 bs->file->bs->supported_zero_flags); 172 173 return 0; 174 } 175 176 static int GRAPH_RDLOCK 177 preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp) 178 { 179 BDRVPreallocateState *s = bs->opaque; 180 int ret; 181 182 if (s->file_end < 0) { 183 s->file_end = bdrv_getlength(bs->file->bs); 184 if (s->file_end < 0) { 185 error_setg_errno(errp, -s->file_end, "Failed to get file length"); 186 return s->file_end; 187 } 188 } 189 190 if (s->data_end < s->file_end) { 191 ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, 192 NULL); 193 if (ret < 0) { 194 error_setg_errno(errp, -ret, "Failed to drop preallocation"); 195 s->file_end = ret; 196 return ret; 197 } 198 s->file_end = s->data_end; 199 } 200 201 return 0; 202 } 203 204 static void preallocate_close(BlockDriverState *bs) 205 { 206 BDRVPreallocateState *s = bs->opaque; 207 208 GLOBAL_STATE_CODE(); 209 GRAPH_RDLOCK_GUARD_MAINLOOP(); 210 211 qemu_bh_cancel(s->drop_resize_bh); 212 qemu_bh_delete(s->drop_resize_bh); 213 214 if (s->data_end >= 0) { 215 preallocate_truncate_to_real_size(bs, NULL); 216 } 217 } 218 219 220 /* 221 * Handle reopen. 222 * 223 * We must implement reopen handlers, otherwise reopen just don't work. Handle 224 * new options and don't care about preallocation state, as it is handled in 225 * set/check permission handlers. 226 */ 227 228 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state, 229 BlockReopenQueue *queue, Error **errp) 230 { 231 PreallocateOpts *opts = g_new0(PreallocateOpts, 1); 232 int ret; 233 234 GLOBAL_STATE_CODE(); 235 GRAPH_RDLOCK_GUARD_MAINLOOP(); 236 237 if (!preallocate_absorb_opts(opts, reopen_state->options, 238 reopen_state->bs->file->bs, errp)) { 239 g_free(opts); 240 return -EINVAL; 241 } 242 243 /* 244 * Drop the preallocation already here if reopening read-only. The child 245 * might also be reopened read-only and then scheduling a BH during the 246 * permission update is too late. 247 */ 248 if ((reopen_state->flags & BDRV_O_RDWR) == 0) { 249 ret = preallocate_drop_resize(reopen_state->bs, errp); 250 if (ret < 0) { 251 g_free(opts); 252 return ret; 253 } 254 } 255 256 reopen_state->opaque = opts; 257 258 return 0; 259 } 260 261 static void preallocate_reopen_commit(BDRVReopenState *state) 262 { 263 BDRVPreallocateState *s = state->bs->opaque; 264 265 s->opts = *(PreallocateOpts *)state->opaque; 266 267 g_free(state->opaque); 268 state->opaque = NULL; 269 } 270 271 static void preallocate_reopen_abort(BDRVReopenState *state) 272 { 273 g_free(state->opaque); 274 state->opaque = NULL; 275 } 276 277 static int coroutine_fn GRAPH_RDLOCK 278 preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes, 279 QEMUIOVector *qiov, size_t qiov_offset, 280 BdrvRequestFlags flags) 281 { 282 return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, 283 flags); 284 } 285 286 static int coroutine_fn GRAPH_RDLOCK 287 preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) 288 { 289 return bdrv_co_pdiscard(bs->file, offset, bytes); 290 } 291 292 static bool can_write_resize(uint64_t perm) 293 { 294 return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE); 295 } 296 297 static bool GRAPH_RDLOCK has_prealloc_perms(BlockDriverState *bs) 298 { 299 BDRVPreallocateState *s = bs->opaque; 300 301 if (can_write_resize(bs->file->perm)) { 302 assert(!(bs->file->shared_perm & BLK_PERM_WRITE)); 303 assert(!(bs->file->shared_perm & BLK_PERM_RESIZE)); 304 return true; 305 } 306 307 assert(s->data_end < 0); 308 assert(s->zero_start < 0); 309 assert(s->file_end < 0); 310 return false; 311 } 312 313 /* 314 * Call on each write. Returns true if @want_merge_zero is true and the region 315 * [offset, offset + bytes) is zeroed (as a result of this call or earlier 316 * preallocation). 317 * 318 * want_merge_zero is used to merge write-zero request with preallocation in 319 * one bdrv_co_pwrite_zeroes() call. 320 */ 321 static bool coroutine_fn GRAPH_RDLOCK 322 handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes, 323 bool want_merge_zero) 324 { 325 BDRVPreallocateState *s = bs->opaque; 326 int64_t end = offset + bytes; 327 int64_t prealloc_start, prealloc_end; 328 int ret; 329 uint32_t file_align = bs->file->bs->bl.request_alignment; 330 uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align); 331 332 assert(QEMU_IS_ALIGNED(prealloc_align, file_align)); 333 334 if (!has_prealloc_perms(bs)) { 335 /* We don't have state neither should try to recover it */ 336 return false; 337 } 338 339 if (s->data_end < 0) { 340 s->data_end = bdrv_co_getlength(bs->file->bs); 341 if (s->data_end < 0) { 342 return false; 343 } 344 345 if (s->file_end < 0) { 346 s->file_end = s->data_end; 347 } 348 } 349 350 if (end <= s->data_end) { 351 return false; 352 } 353 354 /* We have valid s->data_end, and request writes beyond it. */ 355 356 s->data_end = end; 357 if (s->zero_start < 0 || !want_merge_zero) { 358 s->zero_start = end; 359 } 360 361 if (s->file_end < 0) { 362 s->file_end = bdrv_co_getlength(bs->file->bs); 363 if (s->file_end < 0) { 364 return false; 365 } 366 } 367 368 /* Now s->data_end, s->zero_start and s->file_end are valid. */ 369 370 if (end <= s->file_end) { 371 /* No preallocation needed. */ 372 return want_merge_zero && offset >= s->zero_start; 373 } 374 375 /* Now we want new preallocation, as request writes beyond s->file_end. */ 376 377 prealloc_start = QEMU_ALIGN_UP( 378 want_merge_zero ? MIN(offset, s->file_end) : s->file_end, 379 file_align); 380 prealloc_end = QEMU_ALIGN_UP( 381 MAX(prealloc_start, end) + s->opts.prealloc_size, 382 prealloc_align); 383 384 want_merge_zero = want_merge_zero && (prealloc_start <= offset); 385 386 ret = bdrv_co_pwrite_zeroes( 387 bs->file, prealloc_start, prealloc_end - prealloc_start, 388 BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT); 389 if (ret < 0) { 390 s->file_end = ret; 391 return false; 392 } 393 394 s->file_end = prealloc_end; 395 return want_merge_zero; 396 } 397 398 static int coroutine_fn GRAPH_RDLOCK 399 preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, 400 int64_t bytes, BdrvRequestFlags flags) 401 { 402 bool want_merge_zero = 403 !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK)); 404 if (handle_write(bs, offset, bytes, want_merge_zero)) { 405 return 0; 406 } 407 408 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); 409 } 410 411 static int coroutine_fn GRAPH_RDLOCK 412 preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes, 413 QEMUIOVector *qiov, size_t qiov_offset, 414 BdrvRequestFlags flags) 415 { 416 handle_write(bs, offset, bytes, false); 417 418 return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, 419 flags); 420 } 421 422 static int coroutine_fn GRAPH_RDLOCK 423 preallocate_co_truncate(BlockDriverState *bs, int64_t offset, 424 bool exact, PreallocMode prealloc, 425 BdrvRequestFlags flags, Error **errp) 426 { 427 ERRP_GUARD(); 428 BDRVPreallocateState *s = bs->opaque; 429 int ret; 430 431 if (s->data_end >= 0 && offset > s->data_end) { 432 if (s->file_end < 0) { 433 s->file_end = bdrv_co_getlength(bs->file->bs); 434 if (s->file_end < 0) { 435 error_setg(errp, "failed to get file length"); 436 return s->file_end; 437 } 438 } 439 440 if (prealloc == PREALLOC_MODE_FALLOC) { 441 /* 442 * If offset <= s->file_end, the task is already done, just 443 * update s->data_end, to move part of "filter preallocation" 444 * to "preallocation requested by user". 445 * Otherwise just proceed to preallocate missing part. 446 */ 447 if (offset <= s->file_end) { 448 s->data_end = offset; 449 return 0; 450 } 451 } else { 452 /* 453 * We have to drop our preallocation, to 454 * - avoid "Cannot use preallocation for shrinking files" in 455 * case of offset < file_end 456 * - give PREALLOC_MODE_OFF a chance to keep small disk 457 * usage 458 * - give PREALLOC_MODE_FULL a chance to actually write the 459 * whole region as user expects 460 */ 461 if (s->file_end > s->data_end) { 462 ret = bdrv_co_truncate(bs->file, s->data_end, true, 463 PREALLOC_MODE_OFF, 0, errp); 464 if (ret < 0) { 465 s->file_end = ret; 466 error_prepend(errp, "preallocate-filter: failed to drop " 467 "write-zero preallocation: "); 468 return ret; 469 } 470 s->file_end = s->data_end; 471 } 472 } 473 474 s->data_end = offset; 475 } 476 477 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); 478 if (ret < 0) { 479 s->file_end = s->zero_start = s->data_end = ret; 480 return ret; 481 } 482 483 if (has_prealloc_perms(bs)) { 484 s->file_end = s->zero_start = s->data_end = offset; 485 } 486 return 0; 487 } 488 489 static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs) 490 { 491 return bdrv_co_flush(bs->file->bs); 492 } 493 494 static int64_t coroutine_fn GRAPH_RDLOCK 495 preallocate_co_getlength(BlockDriverState *bs) 496 { 497 int64_t ret; 498 BDRVPreallocateState *s = bs->opaque; 499 500 if (s->data_end >= 0) { 501 return s->data_end; 502 } 503 504 ret = bdrv_co_getlength(bs->file->bs); 505 506 if (has_prealloc_perms(bs)) { 507 s->file_end = s->zero_start = s->data_end = ret; 508 } 509 510 return ret; 511 } 512 513 static int GRAPH_RDLOCK 514 preallocate_drop_resize(BlockDriverState *bs, Error **errp) 515 { 516 BDRVPreallocateState *s = bs->opaque; 517 int ret; 518 519 if (s->data_end < 0) { 520 return 0; 521 } 522 523 /* 524 * Before switching children to be read-only, truncate them to remove 525 * the preallocation and let them have the real size. 526 */ 527 ret = preallocate_truncate_to_real_size(bs, errp); 528 if (ret < 0) { 529 return ret; 530 } 531 532 /* 533 * We'll drop our permissions and will allow other users to take write and 534 * resize permissions (see preallocate_child_perm). Anyone will be able to 535 * change the child, so mark all states invalid. We'll regain control if a 536 * parent requests write access again. 537 */ 538 s->data_end = s->file_end = s->zero_start = -EINVAL; 539 540 bdrv_child_refresh_perms(bs, bs->file, NULL); 541 542 return 0; 543 } 544 545 static void preallocate_drop_resize_bh(void *opaque) 546 { 547 GLOBAL_STATE_CODE(); 548 GRAPH_RDLOCK_GUARD_MAINLOOP(); 549 550 /* 551 * In case of errors, we'll simply keep the exclusive lock on the image 552 * indefinitely. 553 */ 554 preallocate_drop_resize(opaque, NULL); 555 } 556 557 static void GRAPH_RDLOCK 558 preallocate_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared) 559 { 560 BDRVPreallocateState *s = bs->opaque; 561 562 if (can_write_resize(perm)) { 563 qemu_bh_cancel(s->drop_resize_bh); 564 if (s->data_end < 0) { 565 s->data_end = s->file_end = s->zero_start = 566 bs->file->bs->total_sectors * BDRV_SECTOR_SIZE; 567 } 568 } else { 569 qemu_bh_schedule(s->drop_resize_bh); 570 } 571 } 572 573 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, 574 BdrvChildRole role, BlockReopenQueue *reopen_queue, 575 uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) 576 { 577 BDRVPreallocateState *s = bs->opaque; 578 579 bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); 580 581 /* 582 * We need exclusive write and resize permissions on the child not only when 583 * the parent can write to it, but also after the parent gave up write 584 * permissions until preallocate_drop_resize() has completed. 585 */ 586 if (can_write_resize(perm) || s->data_end != -EINVAL) { 587 *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE; 588 589 /* 590 * Don't share, to keep our states s->file_end, s->data_end and 591 * s->zero_start valid. 592 */ 593 *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); 594 } 595 } 596 597 static BlockDriver bdrv_preallocate_filter = { 598 .format_name = "preallocate", 599 .instance_size = sizeof(BDRVPreallocateState), 600 601 .bdrv_co_getlength = preallocate_co_getlength, 602 .bdrv_open = preallocate_open, 603 .bdrv_close = preallocate_close, 604 605 .bdrv_reopen_prepare = preallocate_reopen_prepare, 606 .bdrv_reopen_commit = preallocate_reopen_commit, 607 .bdrv_reopen_abort = preallocate_reopen_abort, 608 609 .bdrv_co_preadv_part = preallocate_co_preadv_part, 610 .bdrv_co_pwritev_part = preallocate_co_pwritev_part, 611 .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, 612 .bdrv_co_pdiscard = preallocate_co_pdiscard, 613 .bdrv_co_flush = preallocate_co_flush, 614 .bdrv_co_truncate = preallocate_co_truncate, 615 616 .bdrv_set_perm = preallocate_set_perm, 617 .bdrv_child_perm = preallocate_child_perm, 618 619 .is_filter = true, 620 }; 621 622 static void bdrv_preallocate_init(void) 623 { 624 bdrv_register(&bdrv_preallocate_filter); 625 } 626 627 block_init(bdrv_preallocate_init); 628