1 /* 2 * preallocate filter driver 3 * 4 * The driver performs preallocate operation: it is injected above 5 * some node, and before each write over EOF it does additional preallocating 6 * write-zeroes request. 7 * 8 * Copyright (c) 2020 Virtuozzo International GmbH. 9 * 10 * Author: 11 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com> 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 * 23 * You should have received a copy of the GNU General Public License 24 * along with this program. If not, see <http://www.gnu.org/licenses/>. 25 */ 26 27 #include "qemu/osdep.h" 28 29 #include "qapi/error.h" 30 #include "qemu/module.h" 31 #include "qemu/option.h" 32 #include "qemu/units.h" 33 #include "block/block_int.h" 34 35 36 typedef struct PreallocateOpts { 37 int64_t prealloc_size; 38 int64_t prealloc_align; 39 } PreallocateOpts; 40 41 typedef struct BDRVPreallocateState { 42 PreallocateOpts opts; 43 44 /* 45 * Track real data end, to crop preallocation on close. If < 0 the status is 46 * unknown. 47 * 48 * @data_end is a maximum of file size on open (or when we get write/resize 49 * permissions) and all write request ends after it. So it's safe to 50 * truncate to data_end if it is valid. 51 */ 52 int64_t data_end; 53 54 /* 55 * Start of trailing preallocated area which reads as zero. May be smaller 56 * than data_end, if user does over-EOF write zero operation. If < 0 the 57 * status is unknown. 58 * 59 * If both @zero_start and @file_end are valid, the region 60 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end 61 * is not valid, @zero_start doesn't make much sense. 62 */ 63 int64_t zero_start; 64 65 /* 66 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs), 67 * to avoid extra lseek() calls on each write operation. If < 0 the status 68 * is unknown. 69 */ 70 int64_t file_end; 71 72 /* 73 * All three states @data_end, @zero_start and @file_end are guaranteed to 74 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and 75 * BLK_PERM_WRITE permissions on file child. 76 */ 77 } BDRVPreallocateState; 78 79 #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align" 80 #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size" 81 static QemuOptsList runtime_opts = { 82 .name = "preallocate", 83 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), 84 .desc = { 85 { 86 .name = PREALLOCATE_OPT_PREALLOC_ALIGN, 87 .type = QEMU_OPT_SIZE, 88 .help = "on preallocation, align file length to this number, " 89 "default 1M", 90 }, 91 { 92 .name = PREALLOCATE_OPT_PREALLOC_SIZE, 93 .type = QEMU_OPT_SIZE, 94 .help = "how much to preallocate, default 128M", 95 }, 96 { /* end of list */ } 97 }, 98 }; 99 100 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options, 101 BlockDriverState *child_bs, Error **errp) 102 { 103 QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); 104 105 if (!qemu_opts_absorb_qdict(opts, options, errp)) { 106 return false; 107 } 108 109 dest->prealloc_align = 110 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB); 111 dest->prealloc_size = 112 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB); 113 114 qemu_opts_del(opts); 115 116 if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) { 117 error_setg(errp, "prealloc-align parameter of preallocate filter " 118 "is not aligned to %llu", BDRV_SECTOR_SIZE); 119 return false; 120 } 121 122 if (!QEMU_IS_ALIGNED(dest->prealloc_align, 123 child_bs->bl.request_alignment)) { 124 error_setg(errp, "prealloc-align parameter of preallocate filter " 125 "is not aligned to underlying node request alignment " 126 "(%" PRIi32 ")", child_bs->bl.request_alignment); 127 return false; 128 } 129 130 return true; 131 } 132 133 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, 134 Error **errp) 135 { 136 BDRVPreallocateState *s = bs->opaque; 137 int ret; 138 139 /* 140 * s->data_end and friends should be initialized on permission update. 141 * For this to work, mark them invalid. 142 */ 143 s->file_end = s->zero_start = s->data_end = -EINVAL; 144 145 ret = bdrv_open_file_child(NULL, options, "file", bs, errp); 146 if (ret < 0) { 147 return ret; 148 } 149 150 if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) { 151 return -EINVAL; 152 } 153 154 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | 155 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); 156 157 bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | 158 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & 159 bs->file->bs->supported_zero_flags); 160 161 return 0; 162 } 163 164 static void preallocate_close(BlockDriverState *bs) 165 { 166 int ret; 167 BDRVPreallocateState *s = bs->opaque; 168 169 if (s->data_end < 0) { 170 return; 171 } 172 173 if (s->file_end < 0) { 174 s->file_end = bdrv_getlength(bs->file->bs); 175 if (s->file_end < 0) { 176 return; 177 } 178 } 179 180 if (s->data_end < s->file_end) { 181 ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, 182 NULL); 183 s->file_end = ret < 0 ? ret : s->data_end; 184 } 185 } 186 187 188 /* 189 * Handle reopen. 190 * 191 * We must implement reopen handlers, otherwise reopen just don't work. Handle 192 * new options and don't care about preallocation state, as it is handled in 193 * set/check permission handlers. 194 */ 195 196 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state, 197 BlockReopenQueue *queue, Error **errp) 198 { 199 PreallocateOpts *opts = g_new0(PreallocateOpts, 1); 200 201 if (!preallocate_absorb_opts(opts, reopen_state->options, 202 reopen_state->bs->file->bs, errp)) { 203 g_free(opts); 204 return -EINVAL; 205 } 206 207 reopen_state->opaque = opts; 208 209 return 0; 210 } 211 212 static void preallocate_reopen_commit(BDRVReopenState *state) 213 { 214 BDRVPreallocateState *s = state->bs->opaque; 215 216 s->opts = *(PreallocateOpts *)state->opaque; 217 218 g_free(state->opaque); 219 state->opaque = NULL; 220 } 221 222 static void preallocate_reopen_abort(BDRVReopenState *state) 223 { 224 g_free(state->opaque); 225 state->opaque = NULL; 226 } 227 228 static coroutine_fn int preallocate_co_preadv_part( 229 BlockDriverState *bs, int64_t offset, int64_t bytes, 230 QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags) 231 { 232 return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, 233 flags); 234 } 235 236 static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs, 237 int64_t offset, int64_t bytes) 238 { 239 return bdrv_co_pdiscard(bs->file, offset, bytes); 240 } 241 242 static bool can_write_resize(uint64_t perm) 243 { 244 return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE); 245 } 246 247 static bool has_prealloc_perms(BlockDriverState *bs) 248 { 249 BDRVPreallocateState *s = bs->opaque; 250 251 if (can_write_resize(bs->file->perm)) { 252 assert(!(bs->file->shared_perm & BLK_PERM_WRITE)); 253 assert(!(bs->file->shared_perm & BLK_PERM_RESIZE)); 254 return true; 255 } 256 257 assert(s->data_end < 0); 258 assert(s->zero_start < 0); 259 assert(s->file_end < 0); 260 return false; 261 } 262 263 /* 264 * Call on each write. Returns true if @want_merge_zero is true and the region 265 * [offset, offset + bytes) is zeroed (as a result of this call or earlier 266 * preallocation). 267 * 268 * want_merge_zero is used to merge write-zero request with preallocation in 269 * one bdrv_co_pwrite_zeroes() call. 270 */ 271 static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset, 272 int64_t bytes, bool want_merge_zero) 273 { 274 BDRVPreallocateState *s = bs->opaque; 275 int64_t end = offset + bytes; 276 int64_t prealloc_start, prealloc_end; 277 int ret; 278 uint32_t file_align = bs->file->bs->bl.request_alignment; 279 uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align); 280 281 assert(QEMU_IS_ALIGNED(prealloc_align, file_align)); 282 283 if (!has_prealloc_perms(bs)) { 284 /* We don't have state neither should try to recover it */ 285 return false; 286 } 287 288 if (s->data_end < 0) { 289 s->data_end = bdrv_getlength(bs->file->bs); 290 if (s->data_end < 0) { 291 return false; 292 } 293 294 if (s->file_end < 0) { 295 s->file_end = s->data_end; 296 } 297 } 298 299 if (end <= s->data_end) { 300 return false; 301 } 302 303 /* We have valid s->data_end, and request writes beyond it. */ 304 305 s->data_end = end; 306 if (s->zero_start < 0 || !want_merge_zero) { 307 s->zero_start = end; 308 } 309 310 if (s->file_end < 0) { 311 s->file_end = bdrv_getlength(bs->file->bs); 312 if (s->file_end < 0) { 313 return false; 314 } 315 } 316 317 /* Now s->data_end, s->zero_start and s->file_end are valid. */ 318 319 if (end <= s->file_end) { 320 /* No preallocation needed. */ 321 return want_merge_zero && offset >= s->zero_start; 322 } 323 324 /* Now we want new preallocation, as request writes beyond s->file_end. */ 325 326 prealloc_start = QEMU_ALIGN_UP( 327 want_merge_zero ? MIN(offset, s->file_end) : s->file_end, 328 file_align); 329 prealloc_end = QEMU_ALIGN_UP( 330 MAX(prealloc_start, end) + s->opts.prealloc_size, 331 prealloc_align); 332 333 want_merge_zero = want_merge_zero && (prealloc_start <= offset); 334 335 ret = bdrv_co_pwrite_zeroes( 336 bs->file, prealloc_start, prealloc_end - prealloc_start, 337 BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT); 338 if (ret < 0) { 339 s->file_end = ret; 340 return false; 341 } 342 343 s->file_end = prealloc_end; 344 return want_merge_zero; 345 } 346 347 static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs, 348 int64_t offset, int64_t bytes, BdrvRequestFlags flags) 349 { 350 bool want_merge_zero = 351 !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK)); 352 if (handle_write(bs, offset, bytes, want_merge_zero)) { 353 return 0; 354 } 355 356 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); 357 } 358 359 static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs, 360 int64_t offset, 361 int64_t bytes, 362 QEMUIOVector *qiov, 363 size_t qiov_offset, 364 BdrvRequestFlags flags) 365 { 366 handle_write(bs, offset, bytes, false); 367 368 return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, 369 flags); 370 } 371 372 static int coroutine_fn 373 preallocate_co_truncate(BlockDriverState *bs, int64_t offset, 374 bool exact, PreallocMode prealloc, 375 BdrvRequestFlags flags, Error **errp) 376 { 377 ERRP_GUARD(); 378 BDRVPreallocateState *s = bs->opaque; 379 int ret; 380 381 if (s->data_end >= 0 && offset > s->data_end) { 382 if (s->file_end < 0) { 383 s->file_end = bdrv_getlength(bs->file->bs); 384 if (s->file_end < 0) { 385 error_setg(errp, "failed to get file length"); 386 return s->file_end; 387 } 388 } 389 390 if (prealloc == PREALLOC_MODE_FALLOC) { 391 /* 392 * If offset <= s->file_end, the task is already done, just 393 * update s->data_end, to move part of "filter preallocation" 394 * to "preallocation requested by user". 395 * Otherwise just proceed to preallocate missing part. 396 */ 397 if (offset <= s->file_end) { 398 s->data_end = offset; 399 return 0; 400 } 401 } else { 402 /* 403 * We have to drop our preallocation, to 404 * - avoid "Cannot use preallocation for shrinking files" in 405 * case of offset < file_end 406 * - give PREALLOC_MODE_OFF a chance to keep small disk 407 * usage 408 * - give PREALLOC_MODE_FULL a chance to actually write the 409 * whole region as user expects 410 */ 411 if (s->file_end > s->data_end) { 412 ret = bdrv_co_truncate(bs->file, s->data_end, true, 413 PREALLOC_MODE_OFF, 0, errp); 414 if (ret < 0) { 415 s->file_end = ret; 416 error_prepend(errp, "preallocate-filter: failed to drop " 417 "write-zero preallocation: "); 418 return ret; 419 } 420 s->file_end = s->data_end; 421 } 422 } 423 424 s->data_end = offset; 425 } 426 427 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); 428 if (ret < 0) { 429 s->file_end = s->zero_start = s->data_end = ret; 430 return ret; 431 } 432 433 if (has_prealloc_perms(bs)) { 434 s->file_end = s->zero_start = s->data_end = offset; 435 } 436 return 0; 437 } 438 439 static int coroutine_fn preallocate_co_flush(BlockDriverState *bs) 440 { 441 return bdrv_co_flush(bs->file->bs); 442 } 443 444 static int64_t preallocate_getlength(BlockDriverState *bs) 445 { 446 int64_t ret; 447 BDRVPreallocateState *s = bs->opaque; 448 449 if (s->data_end >= 0) { 450 return s->data_end; 451 } 452 453 ret = bdrv_getlength(bs->file->bs); 454 455 if (has_prealloc_perms(bs)) { 456 s->file_end = s->zero_start = s->data_end = ret; 457 } 458 459 return ret; 460 } 461 462 static int preallocate_check_perm(BlockDriverState *bs, 463 uint64_t perm, uint64_t shared, Error **errp) 464 { 465 BDRVPreallocateState *s = bs->opaque; 466 467 if (s->data_end >= 0 && !can_write_resize(perm)) { 468 /* 469 * Lose permissions. 470 * We should truncate in check_perm, as in set_perm bs->file->perm will 471 * be already changed, and we should not violate it. 472 */ 473 if (s->file_end < 0) { 474 s->file_end = bdrv_getlength(bs->file->bs); 475 if (s->file_end < 0) { 476 error_setg(errp, "Failed to get file length"); 477 return s->file_end; 478 } 479 } 480 481 if (s->data_end < s->file_end) { 482 int ret = bdrv_truncate(bs->file, s->data_end, true, 483 PREALLOC_MODE_OFF, 0, NULL); 484 if (ret < 0) { 485 error_setg(errp, "Failed to drop preallocation"); 486 s->file_end = ret; 487 return ret; 488 } 489 s->file_end = s->data_end; 490 } 491 } 492 493 return 0; 494 } 495 496 static void preallocate_set_perm(BlockDriverState *bs, 497 uint64_t perm, uint64_t shared) 498 { 499 BDRVPreallocateState *s = bs->opaque; 500 501 if (can_write_resize(perm)) { 502 if (s->data_end < 0) { 503 s->data_end = s->file_end = s->zero_start = 504 bdrv_getlength(bs->file->bs); 505 } 506 } else { 507 /* 508 * We drop our permissions, as well as allow shared 509 * permissions (see preallocate_child_perm), anyone will be able to 510 * change the child, so mark all states invalid. We'll regain control if 511 * get good permissions back. 512 */ 513 s->data_end = s->file_end = s->zero_start = -EINVAL; 514 } 515 } 516 517 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, 518 BdrvChildRole role, BlockReopenQueue *reopen_queue, 519 uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) 520 { 521 bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); 522 523 if (can_write_resize(perm)) { 524 /* This should come by default, but let's enforce: */ 525 *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE; 526 527 /* 528 * Don't share, to keep our states s->file_end, s->data_end and 529 * s->zero_start valid. 530 */ 531 *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); 532 } 533 } 534 535 BlockDriver bdrv_preallocate_filter = { 536 .format_name = "preallocate", 537 .instance_size = sizeof(BDRVPreallocateState), 538 539 .bdrv_getlength = preallocate_getlength, 540 .bdrv_open = preallocate_open, 541 .bdrv_close = preallocate_close, 542 543 .bdrv_reopen_prepare = preallocate_reopen_prepare, 544 .bdrv_reopen_commit = preallocate_reopen_commit, 545 .bdrv_reopen_abort = preallocate_reopen_abort, 546 547 .bdrv_co_preadv_part = preallocate_co_preadv_part, 548 .bdrv_co_pwritev_part = preallocate_co_pwritev_part, 549 .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, 550 .bdrv_co_pdiscard = preallocate_co_pdiscard, 551 .bdrv_co_flush = preallocate_co_flush, 552 .bdrv_co_truncate = preallocate_co_truncate, 553 554 .bdrv_check_perm = preallocate_check_perm, 555 .bdrv_set_perm = preallocate_set_perm, 556 .bdrv_child_perm = preallocate_child_perm, 557 558 .has_variable_length = true, 559 .is_filter = true, 560 }; 561 562 static void bdrv_preallocate_init(void) 563 { 564 bdrv_register(&bdrv_preallocate_filter); 565 } 566 567 block_init(bdrv_preallocate_init); 568