xref: /openbmc/qemu/block/preallocate.c (revision e0091133)
1 /*
2  * preallocate filter driver
3  *
4  * The driver performs preallocate operation: it is injected above
5  * some node, and before each write over EOF it does additional preallocating
6  * write-zeroes request.
7  *
8  * Copyright (c) 2020 Virtuozzo International GmbH.
9  *
10  * Author:
11  *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program. If not, see <http://www.gnu.org/licenses/>.
25  */
26 
27 #include "qemu/osdep.h"
28 
29 #include "qapi/error.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/units.h"
33 #include "block/block_int.h"
34 
35 
/* User-configurable parameters of the preallocate filter. */
typedef struct PreallocateOpts {
    /* Value of the "prealloc-size" option: how much to preallocate. */
    int64_t prealloc_size;
    /* Value of the "prealloc-align" option: file length alignment. */
    int64_t prealloc_align;
} PreallocateOpts;
40 
41 typedef struct BDRVPreallocateState {
42     PreallocateOpts opts;
43 
44     /*
45      * Track real data end, to crop preallocation on close. If < 0 the status is
46      * unknown.
47      *
48      * @data_end is a maximum of file size on open (or when we get write/resize
49      * permissions) and all write request ends after it. So it's safe to
50      * truncate to data_end if it is valid.
51      */
52     int64_t data_end;
53 
54     /*
55      * Start of trailing preallocated area which reads as zero. May be smaller
56      * than data_end, if user does over-EOF write zero operation. If < 0 the
57      * status is unknown.
58      *
59      * If both @zero_start and @file_end are valid, the region
60      * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
61      * is not valid, @zero_start doesn't make much sense.
62      */
63     int64_t zero_start;
64 
65     /*
66      * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
67      * to avoid extra lseek() calls on each write operation. If < 0 the status
68      * is unknown.
69      */
70     int64_t file_end;
71 
72     /*
73      * All three states @data_end, @zero_start and @file_end are guaranteed to
74      * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
75      * BLK_PERM_WRITE permissions on file child.
76      */
77 } BDRVPreallocateState;
78 
79 #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
80 #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
81 static QemuOptsList runtime_opts = {
82     .name = "preallocate",
83     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
84     .desc = {
85         {
86             .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
87             .type = QEMU_OPT_SIZE,
88             .help = "on preallocation, align file length to this number, "
89                 "default 1M",
90         },
91         {
92             .name = PREALLOCATE_OPT_PREALLOC_SIZE,
93             .type = QEMU_OPT_SIZE,
94             .help = "how much to preallocate, default 128M",
95         },
96         { /* end of list */ }
97     },
98 };
99 
100 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
101                                     BlockDriverState *child_bs, Error **errp)
102 {
103     QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
104 
105     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
106         return false;
107     }
108 
109     dest->prealloc_align =
110         qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
111     dest->prealloc_size =
112         qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
113 
114     qemu_opts_del(opts);
115 
116     if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
117         error_setg(errp, "prealloc-align parameter of preallocate filter "
118                    "is not aligned to %llu", BDRV_SECTOR_SIZE);
119         return false;
120     }
121 
122     if (!QEMU_IS_ALIGNED(dest->prealloc_align,
123                          child_bs->bl.request_alignment)) {
124         error_setg(errp, "prealloc-align parameter of preallocate filter "
125                    "is not aligned to underlying node request alignment "
126                    "(%" PRIi32 ")", child_bs->bl.request_alignment);
127         return false;
128     }
129 
130     return true;
131 }
132 
133 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
134                             Error **errp)
135 {
136     BDRVPreallocateState *s = bs->opaque;
137     int ret;
138 
139     /*
140      * s->data_end and friends should be initialized on permission update.
141      * For this to work, mark them invalid.
142      */
143     s->file_end = s->zero_start = s->data_end = -EINVAL;
144 
145     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
146     if (ret < 0) {
147         return ret;
148     }
149 
150     if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
151         return -EINVAL;
152     }
153 
154     bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
155         (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
156 
157     bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
158         ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
159             bs->file->bs->supported_zero_flags);
160 
161     return 0;
162 }
163 
164 static void preallocate_close(BlockDriverState *bs)
165 {
166     int ret;
167     BDRVPreallocateState *s = bs->opaque;
168 
169     if (s->data_end < 0) {
170         return;
171     }
172 
173     if (s->file_end < 0) {
174         s->file_end = bdrv_getlength(bs->file->bs);
175         if (s->file_end < 0) {
176             return;
177         }
178     }
179 
180     if (s->data_end < s->file_end) {
181         ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
182                             NULL);
183         s->file_end = ret < 0 ? ret : s->data_end;
184     }
185 }
186 
187 
/*
 * Handle reopen.
 *
 * We must implement reopen handlers, otherwise reopen just doesn't work.
 * Handle new options and don't care about preallocation state, as it is
 * handled in set/check permission handlers.
 */
195 
196 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
197                                       BlockReopenQueue *queue, Error **errp)
198 {
199     PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
200 
201     if (!preallocate_absorb_opts(opts, reopen_state->options,
202                                  reopen_state->bs->file->bs, errp)) {
203         g_free(opts);
204         return -EINVAL;
205     }
206 
207     reopen_state->opaque = opts;
208 
209     return 0;
210 }
211 
212 static void preallocate_reopen_commit(BDRVReopenState *state)
213 {
214     BDRVPreallocateState *s = state->bs->opaque;
215 
216     s->opts = *(PreallocateOpts *)state->opaque;
217 
218     g_free(state->opaque);
219     state->opaque = NULL;
220 }
221 
222 static void preallocate_reopen_abort(BDRVReopenState *state)
223 {
224     g_free(state->opaque);
225     state->opaque = NULL;
226 }
227 
228 static coroutine_fn int preallocate_co_preadv_part(
229         BlockDriverState *bs, int64_t offset, int64_t bytes,
230         QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags)
231 {
232     return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
233                                flags);
234 }
235 
236 static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
237                                                int64_t offset, int64_t bytes)
238 {
239     return bdrv_co_pdiscard(bs->file, offset, bytes);
240 }
241 
242 static bool can_write_resize(uint64_t perm)
243 {
244     return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
245 }
246 
247 static bool has_prealloc_perms(BlockDriverState *bs)
248 {
249     BDRVPreallocateState *s = bs->opaque;
250 
251     if (can_write_resize(bs->file->perm)) {
252         assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
253         assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
254         return true;
255     }
256 
257     assert(s->data_end < 0);
258     assert(s->zero_start < 0);
259     assert(s->file_end < 0);
260     return false;
261 }
262 
263 /*
264  * Call on each write. Returns true if @want_merge_zero is true and the region
265  * [offset, offset + bytes) is zeroed (as a result of this call or earlier
266  * preallocation).
267  *
268  * want_merge_zero is used to merge write-zero request with preallocation in
269  * one bdrv_co_pwrite_zeroes() call.
270  */
271 static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset,
272                                       int64_t bytes, bool want_merge_zero)
273 {
274     BDRVPreallocateState *s = bs->opaque;
275     int64_t end = offset + bytes;
276     int64_t prealloc_start, prealloc_end;
277     int ret;
278     uint32_t file_align = bs->file->bs->bl.request_alignment;
279     uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
280 
281     assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
282 
283     if (!has_prealloc_perms(bs)) {
284         /* We don't have state neither should try to recover it */
285         return false;
286     }
287 
288     if (s->data_end < 0) {
289         s->data_end = bdrv_getlength(bs->file->bs);
290         if (s->data_end < 0) {
291             return false;
292         }
293 
294         if (s->file_end < 0) {
295             s->file_end = s->data_end;
296         }
297     }
298 
299     if (end <= s->data_end) {
300         return false;
301     }
302 
303     /* We have valid s->data_end, and request writes beyond it. */
304 
305     s->data_end = end;
306     if (s->zero_start < 0 || !want_merge_zero) {
307         s->zero_start = end;
308     }
309 
310     if (s->file_end < 0) {
311         s->file_end = bdrv_getlength(bs->file->bs);
312         if (s->file_end < 0) {
313             return false;
314         }
315     }
316 
317     /* Now s->data_end, s->zero_start and s->file_end are valid. */
318 
319     if (end <= s->file_end) {
320         /* No preallocation needed. */
321         return want_merge_zero && offset >= s->zero_start;
322     }
323 
324     /* Now we want new preallocation, as request writes beyond s->file_end. */
325 
326     prealloc_start = QEMU_ALIGN_UP(
327             want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
328             file_align);
329     prealloc_end = QEMU_ALIGN_UP(
330             MAX(prealloc_start, end) + s->opts.prealloc_size,
331             prealloc_align);
332 
333     want_merge_zero = want_merge_zero && (prealloc_start <= offset);
334 
335     ret = bdrv_co_pwrite_zeroes(
336             bs->file, prealloc_start, prealloc_end - prealloc_start,
337             BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
338     if (ret < 0) {
339         s->file_end = ret;
340         return false;
341     }
342 
343     s->file_end = prealloc_end;
344     return want_merge_zero;
345 }
346 
347 static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
348         int64_t offset, int64_t bytes, BdrvRequestFlags flags)
349 {
350     bool want_merge_zero =
351         !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
352     if (handle_write(bs, offset, bytes, want_merge_zero)) {
353         return 0;
354     }
355 
356     return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
357 }
358 
359 static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
360                                                     int64_t offset,
361                                                     int64_t bytes,
362                                                     QEMUIOVector *qiov,
363                                                     size_t qiov_offset,
364                                                     BdrvRequestFlags flags)
365 {
366     handle_write(bs, offset, bytes, false);
367 
368     return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
369                                 flags);
370 }
371 
372 static int coroutine_fn
373 preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
374                         bool exact, PreallocMode prealloc,
375                         BdrvRequestFlags flags, Error **errp)
376 {
377     ERRP_GUARD();
378     BDRVPreallocateState *s = bs->opaque;
379     int ret;
380 
381     if (s->data_end >= 0 && offset > s->data_end) {
382         if (s->file_end < 0) {
383             s->file_end = bdrv_getlength(bs->file->bs);
384             if (s->file_end < 0) {
385                 error_setg(errp, "failed to get file length");
386                 return s->file_end;
387             }
388         }
389 
390         if (prealloc == PREALLOC_MODE_FALLOC) {
391             /*
392              * If offset <= s->file_end, the task is already done, just
393              * update s->data_end, to move part of "filter preallocation"
394              * to "preallocation requested by user".
395              * Otherwise just proceed to preallocate missing part.
396              */
397             if (offset <= s->file_end) {
398                 s->data_end = offset;
399                 return 0;
400             }
401         } else {
402             /*
403              * We have to drop our preallocation, to
404              * - avoid "Cannot use preallocation for shrinking files" in
405              *   case of offset < file_end
406              * - give PREALLOC_MODE_OFF a chance to keep small disk
407              *   usage
408              * - give PREALLOC_MODE_FULL a chance to actually write the
409              *   whole region as user expects
410              */
411             if (s->file_end > s->data_end) {
412                 ret = bdrv_co_truncate(bs->file, s->data_end, true,
413                                        PREALLOC_MODE_OFF, 0, errp);
414                 if (ret < 0) {
415                     s->file_end = ret;
416                     error_prepend(errp, "preallocate-filter: failed to drop "
417                                   "write-zero preallocation: ");
418                     return ret;
419                 }
420                 s->file_end = s->data_end;
421             }
422         }
423 
424         s->data_end = offset;
425     }
426 
427     ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
428     if (ret < 0) {
429         s->file_end = s->zero_start = s->data_end = ret;
430         return ret;
431     }
432 
433     if (has_prealloc_perms(bs)) {
434         s->file_end = s->zero_start = s->data_end = offset;
435     }
436     return 0;
437 }
438 
439 static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
440 {
441     return bdrv_co_flush(bs->file->bs);
442 }
443 
444 static int64_t preallocate_getlength(BlockDriverState *bs)
445 {
446     int64_t ret;
447     BDRVPreallocateState *s = bs->opaque;
448 
449     if (s->data_end >= 0) {
450         return s->data_end;
451     }
452 
453     ret = bdrv_getlength(bs->file->bs);
454 
455     if (has_prealloc_perms(bs)) {
456         s->file_end = s->zero_start = s->data_end = ret;
457     }
458 
459     return ret;
460 }
461 
462 static int preallocate_check_perm(BlockDriverState *bs,
463                                   uint64_t perm, uint64_t shared, Error **errp)
464 {
465     BDRVPreallocateState *s = bs->opaque;
466 
467     if (s->data_end >= 0 && !can_write_resize(perm)) {
468         /*
469          * Lose permissions.
470          * We should truncate in check_perm, as in set_perm bs->file->perm will
471          * be already changed, and we should not violate it.
472          */
473         if (s->file_end < 0) {
474             s->file_end = bdrv_getlength(bs->file->bs);
475             if (s->file_end < 0) {
476                 error_setg(errp, "Failed to get file length");
477                 return s->file_end;
478             }
479         }
480 
481         if (s->data_end < s->file_end) {
482             int ret = bdrv_truncate(bs->file, s->data_end, true,
483                                     PREALLOC_MODE_OFF, 0, NULL);
484             if (ret < 0) {
485                 error_setg(errp, "Failed to drop preallocation");
486                 s->file_end = ret;
487                 return ret;
488             }
489             s->file_end = s->data_end;
490         }
491     }
492 
493     return 0;
494 }
495 
496 static void preallocate_set_perm(BlockDriverState *bs,
497                                  uint64_t perm, uint64_t shared)
498 {
499     BDRVPreallocateState *s = bs->opaque;
500 
501     if (can_write_resize(perm)) {
502         if (s->data_end < 0) {
503             s->data_end = s->file_end = s->zero_start =
504                 bdrv_getlength(bs->file->bs);
505         }
506     } else {
507         /*
508          * We drop our permissions, as well as allow shared
509          * permissions (see preallocate_child_perm), anyone will be able to
510          * change the child, so mark all states invalid. We'll regain control if
511          * get good permissions back.
512          */
513         s->data_end = s->file_end = s->zero_start = -EINVAL;
514     }
515 }
516 
517 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
518     BdrvChildRole role, BlockReopenQueue *reopen_queue,
519     uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
520 {
521     bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
522 
523     if (can_write_resize(perm)) {
524         /* This should come by default, but let's enforce: */
525         *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
526 
527         /*
528          * Don't share, to keep our states s->file_end, s->data_end and
529          * s->zero_start valid.
530          */
531         *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
532     }
533 }
534 
535 BlockDriver bdrv_preallocate_filter = {
536     .format_name = "preallocate",
537     .instance_size = sizeof(BDRVPreallocateState),
538 
539     .bdrv_getlength = preallocate_getlength,
540     .bdrv_open = preallocate_open,
541     .bdrv_close = preallocate_close,
542 
543     .bdrv_reopen_prepare  = preallocate_reopen_prepare,
544     .bdrv_reopen_commit   = preallocate_reopen_commit,
545     .bdrv_reopen_abort    = preallocate_reopen_abort,
546 
547     .bdrv_co_preadv_part = preallocate_co_preadv_part,
548     .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
549     .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
550     .bdrv_co_pdiscard = preallocate_co_pdiscard,
551     .bdrv_co_flush = preallocate_co_flush,
552     .bdrv_co_truncate = preallocate_co_truncate,
553 
554     .bdrv_check_perm = preallocate_check_perm,
555     .bdrv_set_perm = preallocate_set_perm,
556     .bdrv_child_perm = preallocate_child_perm,
557 
558     .has_variable_length = true,
559     .is_filter = true,
560 };
561 
562 static void bdrv_preallocate_init(void)
563 {
564     bdrv_register(&bdrv_preallocate_filter);
565 }
566 
567 block_init(bdrv_preallocate_init);
568