xref: /openbmc/qemu/block/preallocate.c (revision 51e47cf8)
1 /*
2  * preallocate filter driver
3  *
4  * The driver performs preallocate operation: it is injected above
5  * some node, and before each write over EOF it does additional preallocating
6  * write-zeroes request.
7  *
8  * Copyright (c) 2020 Virtuozzo International GmbH.
9  *
10  * Author:
11  *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program. If not, see <http://www.gnu.org/licenses/>.
25  */
26 
27 #include "qemu/osdep.h"
28 
29 #include "qapi/error.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/units.h"
33 #include "block/block-io.h"
34 #include "block/block_int.h"
35 
36 
/*
 * User-configurable options of the preallocate filter, filled in by
 * preallocate_absorb_opts().
 */
typedef struct PreallocateOpts {
    /* "prealloc-size" option: how much to preallocate (default 128 MiB). */
    int64_t prealloc_size;
    /*
     * "prealloc-align" option: on preallocation, the file length is aligned
     * to this number (default 1 MiB).
     */
    int64_t prealloc_align;
} PreallocateOpts;
41 
/*
 * Per-node state of the preallocate filter, stored in bs->opaque.
 */
typedef struct BDRVPreallocateState {
    /* Options currently in effect; replaced on reopen commit. */
    PreallocateOpts opts;

    /*
     * Track real data end, to crop preallocation on close. If < 0 the status is
     * unknown.
     *
     * @data_end is the maximum of the file size on open (or when we get
     * write/resize permissions) and the ends of all write requests after it.
     * So it's safe to truncate to data_end if it is valid.
     */
    int64_t data_end;

    /*
     * Start of trailing preallocated area which reads as zero. May be smaller
     * than data_end, if the user does an over-EOF write-zero operation. If < 0
     * the status is unknown.
     *
     * If both @zero_start and @file_end are valid, the region
     * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
     * is not valid, @zero_start doesn't make much sense.
     */
    int64_t zero_start;

    /*
     * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
     * to avoid extra lseek() calls on each write operation. If < 0 the status
     * is unknown.
     */
    int64_t file_end;

    /*
     * All three states @data_end, @zero_start and @file_end are guaranteed to
     * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
     * BLK_PERM_WRITE permissions on file child.
     */
} BDRVPreallocateState;
79 
/* Names of the runtime options accepted by the preallocate filter. */
#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"

/*
 * Option list used by preallocate_absorb_opts() to parse the filter's
 * options. Both options are sizes; the defaults named in the help texts are
 * applied by preallocate_absorb_opts() when an option is not given.
 */
static QemuOptsList runtime_opts = {
    .name = "preallocate",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
            .type = QEMU_OPT_SIZE,
            .help = "on preallocation, align file length to this number, "
                "default 1M",
        },
        {
            .name = PREALLOCATE_OPT_PREALLOC_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "how much to preallocate, default 128M",
        },
        { /* end of list */ }
    },
};
100 
101 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
102                                     BlockDriverState *child_bs, Error **errp)
103 {
104     QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
105 
106     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
107         return false;
108     }
109 
110     dest->prealloc_align =
111         qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
112     dest->prealloc_size =
113         qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
114 
115     qemu_opts_del(opts);
116 
117     if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
118         error_setg(errp, "prealloc-align parameter of preallocate filter "
119                    "is not aligned to %llu", BDRV_SECTOR_SIZE);
120         return false;
121     }
122 
123     if (!QEMU_IS_ALIGNED(dest->prealloc_align,
124                          child_bs->bl.request_alignment)) {
125         error_setg(errp, "prealloc-align parameter of preallocate filter "
126                    "is not aligned to underlying node request alignment "
127                    "(%" PRIi32 ")", child_bs->bl.request_alignment);
128         return false;
129     }
130 
131     return true;
132 }
133 
134 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
135                             Error **errp)
136 {
137     BDRVPreallocateState *s = bs->opaque;
138     int ret;
139 
140     /*
141      * s->data_end and friends should be initialized on permission update.
142      * For this to work, mark them invalid.
143      */
144     s->file_end = s->zero_start = s->data_end = -EINVAL;
145 
146     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
147     if (ret < 0) {
148         return ret;
149     }
150 
151     if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
152         return -EINVAL;
153     }
154 
155     bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
156         (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
157 
158     bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
159         ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
160             bs->file->bs->supported_zero_flags);
161 
162     return 0;
163 }
164 
165 static void preallocate_close(BlockDriverState *bs)
166 {
167     int ret;
168     BDRVPreallocateState *s = bs->opaque;
169 
170     if (s->data_end < 0) {
171         return;
172     }
173 
174     if (s->file_end < 0) {
175         s->file_end = bdrv_getlength(bs->file->bs);
176         if (s->file_end < 0) {
177             return;
178         }
179     }
180 
181     if (s->data_end < s->file_end) {
182         ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
183                             NULL);
184         s->file_end = ret < 0 ? ret : s->data_end;
185     }
186 }
187 
188 
/*
 * Handle reopen.
 *
 * We must implement the reopen handlers, otherwise reopen simply doesn't
 * work. They only handle the new options and don't care about the
 * preallocation state, as that is handled in the set/check permission
 * handlers.
 */
196 
197 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
198                                       BlockReopenQueue *queue, Error **errp)
199 {
200     PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
201 
202     if (!preallocate_absorb_opts(opts, reopen_state->options,
203                                  reopen_state->bs->file->bs, errp)) {
204         g_free(opts);
205         return -EINVAL;
206     }
207 
208     reopen_state->opaque = opts;
209 
210     return 0;
211 }
212 
213 static void preallocate_reopen_commit(BDRVReopenState *state)
214 {
215     BDRVPreallocateState *s = state->bs->opaque;
216 
217     s->opts = *(PreallocateOpts *)state->opaque;
218 
219     g_free(state->opaque);
220     state->opaque = NULL;
221 }
222 
223 static void preallocate_reopen_abort(BDRVReopenState *state)
224 {
225     g_free(state->opaque);
226     state->opaque = NULL;
227 }
228 
229 static int coroutine_fn GRAPH_RDLOCK
230 preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
231                            QEMUIOVector *qiov, size_t qiov_offset,
232                            BdrvRequestFlags flags)
233 {
234     return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
235                                flags);
236 }
237 
238 static int coroutine_fn GRAPH_RDLOCK
239 preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
240 {
241     return bdrv_co_pdiscard(bs->file, offset, bytes);
242 }
243 
244 static bool can_write_resize(uint64_t perm)
245 {
246     return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
247 }
248 
249 static bool has_prealloc_perms(BlockDriverState *bs)
250 {
251     BDRVPreallocateState *s = bs->opaque;
252 
253     if (can_write_resize(bs->file->perm)) {
254         assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
255         assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
256         return true;
257     }
258 
259     assert(s->data_end < 0);
260     assert(s->zero_start < 0);
261     assert(s->file_end < 0);
262     return false;
263 }
264 
265 /*
266  * Call on each write. Returns true if @want_merge_zero is true and the region
267  * [offset, offset + bytes) is zeroed (as a result of this call or earlier
268  * preallocation).
269  *
270  * want_merge_zero is used to merge write-zero request with preallocation in
271  * one bdrv_co_pwrite_zeroes() call.
272  */
273 static bool coroutine_fn GRAPH_RDLOCK
274 handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
275              bool want_merge_zero)
276 {
277     BDRVPreallocateState *s = bs->opaque;
278     int64_t end = offset + bytes;
279     int64_t prealloc_start, prealloc_end;
280     int ret;
281     uint32_t file_align = bs->file->bs->bl.request_alignment;
282     uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
283 
284     assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
285 
286     if (!has_prealloc_perms(bs)) {
287         /* We don't have state neither should try to recover it */
288         return false;
289     }
290 
291     if (s->data_end < 0) {
292         s->data_end = bdrv_co_getlength(bs->file->bs);
293         if (s->data_end < 0) {
294             return false;
295         }
296 
297         if (s->file_end < 0) {
298             s->file_end = s->data_end;
299         }
300     }
301 
302     if (end <= s->data_end) {
303         return false;
304     }
305 
306     /* We have valid s->data_end, and request writes beyond it. */
307 
308     s->data_end = end;
309     if (s->zero_start < 0 || !want_merge_zero) {
310         s->zero_start = end;
311     }
312 
313     if (s->file_end < 0) {
314         s->file_end = bdrv_co_getlength(bs->file->bs);
315         if (s->file_end < 0) {
316             return false;
317         }
318     }
319 
320     /* Now s->data_end, s->zero_start and s->file_end are valid. */
321 
322     if (end <= s->file_end) {
323         /* No preallocation needed. */
324         return want_merge_zero && offset >= s->zero_start;
325     }
326 
327     /* Now we want new preallocation, as request writes beyond s->file_end. */
328 
329     prealloc_start = QEMU_ALIGN_UP(
330             want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
331             file_align);
332     prealloc_end = QEMU_ALIGN_UP(
333             MAX(prealloc_start, end) + s->opts.prealloc_size,
334             prealloc_align);
335 
336     want_merge_zero = want_merge_zero && (prealloc_start <= offset);
337 
338     ret = bdrv_co_pwrite_zeroes(
339             bs->file, prealloc_start, prealloc_end - prealloc_start,
340             BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
341     if (ret < 0) {
342         s->file_end = ret;
343         return false;
344     }
345 
346     s->file_end = prealloc_end;
347     return want_merge_zero;
348 }
349 
350 static int coroutine_fn GRAPH_RDLOCK
351 preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
352                              int64_t bytes, BdrvRequestFlags flags)
353 {
354     bool want_merge_zero =
355         !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
356     if (handle_write(bs, offset, bytes, want_merge_zero)) {
357         return 0;
358     }
359 
360     return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
361 }
362 
363 static int coroutine_fn GRAPH_RDLOCK
364 preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
365                             QEMUIOVector *qiov, size_t qiov_offset,
366                             BdrvRequestFlags flags)
367 {
368     handle_write(bs, offset, bytes, false);
369 
370     return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
371                                 flags);
372 }
373 
/*
 * Truncate handler: resize the file child to @offset, taking the filter's own
 * preallocation into account.
 *
 * When the data end is known and the new size lies beyond it:
 *  - for PREALLOC_MODE_FALLOC, an already preallocated area is simply
 *    re-accounted as user data and no request is sent if it covers @offset;
 *  - for the other modes, the filter's preallocation is dropped first.
 *
 * Returns 0 on success, a negative errno value (with @errp set) on failure.
 */
static int coroutine_fn GRAPH_RDLOCK
preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
                        bool exact, PreallocMode prealloc,
                        BdrvRequestFlags flags, Error **errp)
{
    ERRP_GUARD();
    BDRVPreallocateState *s = bs->opaque;
    int ret;

    if (s->data_end >= 0 && offset > s->data_end) {
        /* Growing beyond the tracked data end; we need the real file end. */
        if (s->file_end < 0) {
            s->file_end = bdrv_co_getlength(bs->file->bs);
            if (s->file_end < 0) {
                error_setg(errp, "failed to get file length");
                return s->file_end;
            }
        }

        if (prealloc == PREALLOC_MODE_FALLOC) {
            /*
             * If offset <= s->file_end, the task is already done, just
             * update s->data_end, to move part of "filter preallocation"
             * to "preallocation requested by user".
             * Otherwise just proceed to preallocate missing part.
             */
            if (offset <= s->file_end) {
                s->data_end = offset;
                return 0;
            }
        } else {
            /*
             * We have to drop our preallocation, to
             * - avoid "Cannot use preallocation for shrinking files" in
             *   case of offset < file_end
             * - give PREALLOC_MODE_OFF a chance to keep small disk
             *   usage
             * - give PREALLOC_MODE_FULL a chance to actually write the
             *   whole region as user expects
             */
            if (s->file_end > s->data_end) {
                ret = bdrv_co_truncate(bs->file, s->data_end, true,
                                       PREALLOC_MODE_OFF, 0, errp);
                if (ret < 0) {
                    /* The real file end is unknown after a failed truncate. */
                    s->file_end = ret;
                    error_prepend(errp, "preallocate-filter: failed to drop "
                                  "write-zero preallocation: ");
                    return ret;
                }
                s->file_end = s->data_end;
            }
        }

        s->data_end = offset;
    }

    /* Perform the truncate actually requested by the caller. */
    ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
    if (ret < 0) {
        /* Nothing is known about the file any more; invalidate all states. */
        s->file_end = s->zero_start = s->data_end = ret;
        return ret;
    }

    /* The states may be valid only while we hold write/resize permissions. */
    if (has_prealloc_perms(bs)) {
        s->file_end = s->zero_start = s->data_end = offset;
    }
    return 0;
}
440 
441 static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs)
442 {
443     return bdrv_co_flush(bs->file->bs);
444 }
445 
446 static int64_t coroutine_fn GRAPH_RDLOCK
447 preallocate_co_getlength(BlockDriverState *bs)
448 {
449     int64_t ret;
450     BDRVPreallocateState *s = bs->opaque;
451 
452     if (s->data_end >= 0) {
453         return s->data_end;
454     }
455 
456     ret = bdrv_co_getlength(bs->file->bs);
457 
458     if (has_prealloc_perms(bs)) {
459         s->file_end = s->zero_start = s->data_end = ret;
460     }
461 
462     return ret;
463 }
464 
465 static int preallocate_check_perm(BlockDriverState *bs,
466                                   uint64_t perm, uint64_t shared, Error **errp)
467 {
468     BDRVPreallocateState *s = bs->opaque;
469 
470     if (s->data_end >= 0 && !can_write_resize(perm)) {
471         /*
472          * Lose permissions.
473          * We should truncate in check_perm, as in set_perm bs->file->perm will
474          * be already changed, and we should not violate it.
475          */
476         if (s->file_end < 0) {
477             s->file_end = bdrv_getlength(bs->file->bs);
478             if (s->file_end < 0) {
479                 error_setg(errp, "Failed to get file length");
480                 return s->file_end;
481             }
482         }
483 
484         if (s->data_end < s->file_end) {
485             int ret = bdrv_truncate(bs->file, s->data_end, true,
486                                     PREALLOC_MODE_OFF, 0, NULL);
487             if (ret < 0) {
488                 error_setg(errp, "Failed to drop preallocation");
489                 s->file_end = ret;
490                 return ret;
491             }
492             s->file_end = s->data_end;
493         }
494     }
495 
496     return 0;
497 }
498 
499 static void preallocate_set_perm(BlockDriverState *bs,
500                                  uint64_t perm, uint64_t shared)
501 {
502     BDRVPreallocateState *s = bs->opaque;
503 
504     if (can_write_resize(perm)) {
505         if (s->data_end < 0) {
506             s->data_end = s->file_end = s->zero_start =
507                 bdrv_getlength(bs->file->bs);
508         }
509     } else {
510         /*
511          * We drop our permissions, as well as allow shared
512          * permissions (see preallocate_child_perm), anyone will be able to
513          * change the child, so mark all states invalid. We'll regain control if
514          * get good permissions back.
515          */
516         s->data_end = s->file_end = s->zero_start = -EINVAL;
517     }
518 }
519 
520 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
521     BdrvChildRole role, BlockReopenQueue *reopen_queue,
522     uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
523 {
524     bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
525 
526     if (can_write_resize(perm)) {
527         /* This should come by default, but let's enforce: */
528         *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
529 
530         /*
531          * Don't share, to keep our states s->file_end, s->data_end and
532          * s->zero_start valid.
533          */
534         *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
535     }
536 }
537 
/*
 * Driver description of the "preallocate" filter, registered by
 * bdrv_preallocate_init().
 */
BlockDriver bdrv_preallocate_filter = {
    .format_name = "preallocate",
    .instance_size = sizeof(BDRVPreallocateState),

    .bdrv_co_getlength    = preallocate_co_getlength,
    .bdrv_open            = preallocate_open,
    .bdrv_close           = preallocate_close,

    /* Reopen only replaces the options, see the handlers. */
    .bdrv_reopen_prepare  = preallocate_reopen_prepare,
    .bdrv_reopen_commit   = preallocate_reopen_commit,
    .bdrv_reopen_abort    = preallocate_reopen_abort,

    /* I/O handlers */
    .bdrv_co_preadv_part = preallocate_co_preadv_part,
    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
    .bdrv_co_pdiscard = preallocate_co_pdiscard,
    .bdrv_co_flush = preallocate_co_flush,
    .bdrv_co_truncate = preallocate_co_truncate,

    /* The preallocation state is set up and dropped on permission changes. */
    .bdrv_check_perm = preallocate_check_perm,
    .bdrv_set_perm = preallocate_set_perm,
    .bdrv_child_perm = preallocate_child_perm,

    .is_filter = true,
};
563 
/* Register the preallocate filter driver with the block layer. */
static void bdrv_preallocate_init(void)
{
    bdrv_register(&bdrv_preallocate_filter);
}

/* NOTE(review): presumably arranges for the init function to run at block
 * layer initialization; confirm against the block_init() definition. */
block_init(bdrv_preallocate_init);
570