1 /* 2 * preallocate filter driver 3 * 4 * The driver performs preallocate operation: it is injected above 5 * some node, and before each write over EOF it does additional preallocating 6 * write-zeroes request. 7 * 8 * Copyright (c) 2020 Virtuozzo International GmbH. 9 * 10 * Author: 11 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com> 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 * 23 * You should have received a copy of the GNU General Public License 24 * along with this program. If not, see <http://www.gnu.org/licenses/>. 25 */ 26 27 #include "qemu/osdep.h" 28 29 #include "qapi/error.h" 30 #include "qemu/module.h" 31 #include "qemu/option.h" 32 #include "qemu/units.h" 33 #include "block/block-io.h" 34 #include "block/block_int.h" 35 36 37 typedef struct PreallocateOpts { 38 int64_t prealloc_size; 39 int64_t prealloc_align; 40 } PreallocateOpts; 41 42 typedef struct BDRVPreallocateState { 43 PreallocateOpts opts; 44 45 /* 46 * Track real data end, to crop preallocation on close. If < 0 the status is 47 * unknown. 48 * 49 * @data_end is a maximum of file size on open (or when we get write/resize 50 * permissions) and all write request ends after it. So it's safe to 51 * truncate to data_end if it is valid. 52 */ 53 int64_t data_end; 54 55 /* 56 * Start of trailing preallocated area which reads as zero. May be smaller 57 * than data_end, if user does over-EOF write zero operation. If < 0 the 58 * status is unknown. 59 * 60 * If both @zero_start and @file_end are valid, the region 61 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end 62 * is not valid, @zero_start doesn't make much sense. 63 */ 64 int64_t zero_start; 65 66 /* 67 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs), 68 * to avoid extra lseek() calls on each write operation. If < 0 the status 69 * is unknown. 70 */ 71 int64_t file_end; 72 73 /* 74 * All three states @data_end, @zero_start and @file_end are guaranteed to 75 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and 76 * BLK_PERM_WRITE permissions on file child. 77 */ 78 } BDRVPreallocateState; 79 80 #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align" 81 #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size" 82 static QemuOptsList runtime_opts = { 83 .name = "preallocate", 84 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), 85 .desc = { 86 { 87 .name = PREALLOCATE_OPT_PREALLOC_ALIGN, 88 .type = QEMU_OPT_SIZE, 89 .help = "on preallocation, align file length to this number, " 90 "default 1M", 91 }, 92 { 93 .name = PREALLOCATE_OPT_PREALLOC_SIZE, 94 .type = QEMU_OPT_SIZE, 95 .help = "how much to preallocate, default 128M", 96 }, 97 { /* end of list */ } 98 }, 99 }; 100 101 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options, 102 BlockDriverState *child_bs, Error **errp) 103 { 104 QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); 105 106 if (!qemu_opts_absorb_qdict(opts, options, errp)) { 107 return false; 108 } 109 110 dest->prealloc_align = 111 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB); 112 dest->prealloc_size = 113 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB); 114 115 qemu_opts_del(opts); 116 117 if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) { 118 error_setg(errp, "prealloc-align parameter of preallocate filter " 119 "is not aligned to %llu", BDRV_SECTOR_SIZE); 120 return false; 121 } 122 123 if (!QEMU_IS_ALIGNED(dest->prealloc_align, 124 child_bs->bl.request_alignment)) { 125 error_setg(errp, "prealloc-align parameter of preallocate filter " 126 "is not aligned to underlying node request alignment " 127 "(%" PRIi32 ")", child_bs->bl.request_alignment); 128 return false; 129 } 130 131 return true; 132 } 133 134 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, 135 Error **errp) 136 { 137 BDRVPreallocateState *s = bs->opaque; 138 int ret; 139 140 /* 141 * s->data_end and friends should be initialized on permission update. 142 * For this to work, mark them invalid. 143 */ 144 s->file_end = s->zero_start = s->data_end = -EINVAL; 145 146 ret = bdrv_open_file_child(NULL, options, "file", bs, errp); 147 if (ret < 0) { 148 return ret; 149 } 150 151 if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) { 152 return -EINVAL; 153 } 154 155 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | 156 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); 157 158 bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | 159 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & 160 bs->file->bs->supported_zero_flags); 161 162 return 0; 163 } 164 165 static void preallocate_close(BlockDriverState *bs) 166 { 167 int ret; 168 BDRVPreallocateState *s = bs->opaque; 169 170 if (s->data_end < 0) { 171 return; 172 } 173 174 if (s->file_end < 0) { 175 s->file_end = bdrv_getlength(bs->file->bs); 176 if (s->file_end < 0) { 177 return; 178 } 179 } 180 181 if (s->data_end < s->file_end) { 182 ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, 183 NULL); 184 s->file_end = ret < 0 ? ret : s->data_end; 185 } 186 } 187 188 189 /* 190 * Handle reopen. 191 * 192 * We must implement reopen handlers, otherwise reopen just don't work. Handle 193 * new options and don't care about preallocation state, as it is handled in 194 * set/check permission handlers. 195 */ 196 197 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state, 198 BlockReopenQueue *queue, Error **errp) 199 { 200 PreallocateOpts *opts = g_new0(PreallocateOpts, 1); 201 202 if (!preallocate_absorb_opts(opts, reopen_state->options, 203 reopen_state->bs->file->bs, errp)) { 204 g_free(opts); 205 return -EINVAL; 206 } 207 208 reopen_state->opaque = opts; 209 210 return 0; 211 } 212 213 static void preallocate_reopen_commit(BDRVReopenState *state) 214 { 215 BDRVPreallocateState *s = state->bs->opaque; 216 217 s->opts = *(PreallocateOpts *)state->opaque; 218 219 g_free(state->opaque); 220 state->opaque = NULL; 221 } 222 223 static void preallocate_reopen_abort(BDRVReopenState *state) 224 { 225 g_free(state->opaque); 226 state->opaque = NULL; 227 } 228 229 static int coroutine_fn GRAPH_RDLOCK 230 preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes, 231 QEMUIOVector *qiov, size_t qiov_offset, 232 BdrvRequestFlags flags) 233 { 234 return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, 235 flags); 236 } 237 238 static int coroutine_fn GRAPH_RDLOCK 239 preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) 240 { 241 return bdrv_co_pdiscard(bs->file, offset, bytes); 242 } 243 244 static bool can_write_resize(uint64_t perm) 245 { 246 return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE); 247 } 248 249 static bool has_prealloc_perms(BlockDriverState *bs) 250 { 251 BDRVPreallocateState *s = bs->opaque; 252 253 if (can_write_resize(bs->file->perm)) { 254 assert(!(bs->file->shared_perm & BLK_PERM_WRITE)); 255 assert(!(bs->file->shared_perm & BLK_PERM_RESIZE)); 256 return true; 257 } 258 259 assert(s->data_end < 0); 260 assert(s->zero_start < 0); 261 assert(s->file_end < 0); 262 return false; 263 } 264 265 /* 266 * Call on each write. Returns true if @want_merge_zero is true and the region 267 * [offset, offset + bytes) is zeroed (as a result of this call or earlier 268 * preallocation). 269 * 270 * want_merge_zero is used to merge write-zero request with preallocation in 271 * one bdrv_co_pwrite_zeroes() call. 272 */ 273 static bool coroutine_fn GRAPH_RDLOCK 274 handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes, 275 bool want_merge_zero) 276 { 277 BDRVPreallocateState *s = bs->opaque; 278 int64_t end = offset + bytes; 279 int64_t prealloc_start, prealloc_end; 280 int ret; 281 uint32_t file_align = bs->file->bs->bl.request_alignment; 282 uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align); 283 284 assert(QEMU_IS_ALIGNED(prealloc_align, file_align)); 285 286 if (!has_prealloc_perms(bs)) { 287 /* We don't have state neither should try to recover it */ 288 return false; 289 } 290 291 if (s->data_end < 0) { 292 s->data_end = bdrv_co_getlength(bs->file->bs); 293 if (s->data_end < 0) { 294 return false; 295 } 296 297 if (s->file_end < 0) { 298 s->file_end = s->data_end; 299 } 300 } 301 302 if (end <= s->data_end) { 303 return false; 304 } 305 306 /* We have valid s->data_end, and request writes beyond it. */ 307 308 s->data_end = end; 309 if (s->zero_start < 0 || !want_merge_zero) { 310 s->zero_start = end; 311 } 312 313 if (s->file_end < 0) { 314 s->file_end = bdrv_co_getlength(bs->file->bs); 315 if (s->file_end < 0) { 316 return false; 317 } 318 } 319 320 /* Now s->data_end, s->zero_start and s->file_end are valid. */ 321 322 if (end <= s->file_end) { 323 /* No preallocation needed. */ 324 return want_merge_zero && offset >= s->zero_start; 325 } 326 327 /* Now we want new preallocation, as request writes beyond s->file_end. */ 328 329 prealloc_start = QEMU_ALIGN_UP( 330 want_merge_zero ? MIN(offset, s->file_end) : s->file_end, 331 file_align); 332 prealloc_end = QEMU_ALIGN_UP( 333 MAX(prealloc_start, end) + s->opts.prealloc_size, 334 prealloc_align); 335 336 want_merge_zero = want_merge_zero && (prealloc_start <= offset); 337 338 ret = bdrv_co_pwrite_zeroes( 339 bs->file, prealloc_start, prealloc_end - prealloc_start, 340 BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT); 341 if (ret < 0) { 342 s->file_end = ret; 343 return false; 344 } 345 346 s->file_end = prealloc_end; 347 return want_merge_zero; 348 } 349 350 static int coroutine_fn GRAPH_RDLOCK 351 preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, 352 int64_t bytes, BdrvRequestFlags flags) 353 { 354 bool want_merge_zero = 355 !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK)); 356 if (handle_write(bs, offset, bytes, want_merge_zero)) { 357 return 0; 358 } 359 360 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); 361 } 362 363 static int coroutine_fn GRAPH_RDLOCK 364 preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes, 365 QEMUIOVector *qiov, size_t qiov_offset, 366 BdrvRequestFlags flags) 367 { 368 handle_write(bs, offset, bytes, false); 369 370 return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, 371 flags); 372 } 373 374 static int coroutine_fn GRAPH_RDLOCK 375 preallocate_co_truncate(BlockDriverState *bs, int64_t offset, 376 bool exact, PreallocMode prealloc, 377 BdrvRequestFlags flags, Error **errp) 378 { 379 ERRP_GUARD(); 380 BDRVPreallocateState *s = bs->opaque; 381 int ret; 382 383 if (s->data_end >= 0 && offset > s->data_end) { 384 if (s->file_end < 0) { 385 s->file_end = bdrv_co_getlength(bs->file->bs); 386 if (s->file_end < 0) { 387 error_setg(errp, "failed to get file length"); 388 return s->file_end; 389 } 390 } 391 392 if (prealloc == PREALLOC_MODE_FALLOC) { 393 /* 394 * If offset <= s->file_end, the task is already done, just 395 * update s->data_end, to move part of "filter preallocation" 396 * to "preallocation requested by user". 397 * Otherwise just proceed to preallocate missing part. 398 */ 399 if (offset <= s->file_end) { 400 s->data_end = offset; 401 return 0; 402 } 403 } else { 404 /* 405 * We have to drop our preallocation, to 406 * - avoid "Cannot use preallocation for shrinking files" in 407 * case of offset < file_end 408 * - give PREALLOC_MODE_OFF a chance to keep small disk 409 * usage 410 * - give PREALLOC_MODE_FULL a chance to actually write the 411 * whole region as user expects 412 */ 413 if (s->file_end > s->data_end) { 414 ret = bdrv_co_truncate(bs->file, s->data_end, true, 415 PREALLOC_MODE_OFF, 0, errp); 416 if (ret < 0) { 417 s->file_end = ret; 418 error_prepend(errp, "preallocate-filter: failed to drop " 419 "write-zero preallocation: "); 420 return ret; 421 } 422 s->file_end = s->data_end; 423 } 424 } 425 426 s->data_end = offset; 427 } 428 429 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); 430 if (ret < 0) { 431 s->file_end = s->zero_start = s->data_end = ret; 432 return ret; 433 } 434 435 if (has_prealloc_perms(bs)) { 436 s->file_end = s->zero_start = s->data_end = offset; 437 } 438 return 0; 439 } 440 441 static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs) 442 { 443 return bdrv_co_flush(bs->file->bs); 444 } 445 446 static int64_t coroutine_fn GRAPH_RDLOCK 447 preallocate_co_getlength(BlockDriverState *bs) 448 { 449 int64_t ret; 450 BDRVPreallocateState *s = bs->opaque; 451 452 if (s->data_end >= 0) { 453 return s->data_end; 454 } 455 456 ret = bdrv_co_getlength(bs->file->bs); 457 458 if (has_prealloc_perms(bs)) { 459 s->file_end = s->zero_start = s->data_end = ret; 460 } 461 462 return ret; 463 } 464 465 static int preallocate_check_perm(BlockDriverState *bs, 466 uint64_t perm, uint64_t shared, Error **errp) 467 { 468 BDRVPreallocateState *s = bs->opaque; 469 470 if (s->data_end >= 0 && !can_write_resize(perm)) { 471 /* 472 * Lose permissions. 473 * We should truncate in check_perm, as in set_perm bs->file->perm will 474 * be already changed, and we should not violate it. 475 */ 476 if (s->file_end < 0) { 477 s->file_end = bdrv_getlength(bs->file->bs); 478 if (s->file_end < 0) { 479 error_setg(errp, "Failed to get file length"); 480 return s->file_end; 481 } 482 } 483 484 if (s->data_end < s->file_end) { 485 int ret = bdrv_truncate(bs->file, s->data_end, true, 486 PREALLOC_MODE_OFF, 0, NULL); 487 if (ret < 0) { 488 error_setg(errp, "Failed to drop preallocation"); 489 s->file_end = ret; 490 return ret; 491 } 492 s->file_end = s->data_end; 493 } 494 } 495 496 return 0; 497 } 498 499 static void preallocate_set_perm(BlockDriverState *bs, 500 uint64_t perm, uint64_t shared) 501 { 502 BDRVPreallocateState *s = bs->opaque; 503 504 if (can_write_resize(perm)) { 505 if (s->data_end < 0) { 506 s->data_end = s->file_end = s->zero_start = 507 bdrv_getlength(bs->file->bs); 508 } 509 } else { 510 /* 511 * We drop our permissions, as well as allow shared 512 * permissions (see preallocate_child_perm), anyone will be able to 513 * change the child, so mark all states invalid. We'll regain control if 514 * get good permissions back. 515 */ 516 s->data_end = s->file_end = s->zero_start = -EINVAL; 517 } 518 } 519 520 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, 521 BdrvChildRole role, BlockReopenQueue *reopen_queue, 522 uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) 523 { 524 bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); 525 526 if (can_write_resize(perm)) { 527 /* This should come by default, but let's enforce: */ 528 *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE; 529 530 /* 531 * Don't share, to keep our states s->file_end, s->data_end and 532 * s->zero_start valid. 533 */ 534 *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); 535 } 536 } 537 538 BlockDriver bdrv_preallocate_filter = { 539 .format_name = "preallocate", 540 .instance_size = sizeof(BDRVPreallocateState), 541 542 .bdrv_co_getlength = preallocate_co_getlength, 543 .bdrv_open = preallocate_open, 544 .bdrv_close = preallocate_close, 545 546 .bdrv_reopen_prepare = preallocate_reopen_prepare, 547 .bdrv_reopen_commit = preallocate_reopen_commit, 548 .bdrv_reopen_abort = preallocate_reopen_abort, 549 550 .bdrv_co_preadv_part = preallocate_co_preadv_part, 551 .bdrv_co_pwritev_part = preallocate_co_pwritev_part, 552 .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, 553 .bdrv_co_pdiscard = preallocate_co_pdiscard, 554 .bdrv_co_flush = preallocate_co_flush, 555 .bdrv_co_truncate = preallocate_co_truncate, 556 557 .bdrv_check_perm = preallocate_check_perm, 558 .bdrv_set_perm = preallocate_set_perm, 559 .bdrv_child_perm = preallocate_child_perm, 560 561 .is_filter = true, 562 }; 563 564 static void bdrv_preallocate_init(void) 565 { 566 bdrv_register(&bdrv_preallocate_filter); 567 } 568 569 block_init(bdrv_preallocate_init); 570