xref: /openbmc/qemu/block/preallocate.c (revision afb81fe8)
1 /*
2  * preallocate filter driver
3  *
4  * The driver performs preallocate operation: it is injected above
5  * some node, and before each write over EOF it does additional preallocating
6  * write-zeroes request.
7  *
8  * Copyright (c) 2020 Virtuozzo International GmbH.
9  *
10  * Author:
11  *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program. If not, see <http://www.gnu.org/licenses/>.
25  */
26 
27 #include "qemu/osdep.h"
28 
29 #include "qapi/error.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/units.h"
33 #include "block/block-io.h"
34 #include "block/block_int.h"
35 
36 
/*
 * User-tunable parameters of the preallocate filter. They are filled in by
 * preallocate_absorb_opts() from the "prealloc-size" and "prealloc-align"
 * runtime options.
 */
typedef struct PreallocateOpts {
    /* Extra bytes to preallocate past the end of a write beyond EOF. */
    int64_t prealloc_size;
    /* The end of a new preallocation is rounded up to a multiple of this. */
    int64_t prealloc_align;
} PreallocateOpts;
41 
/*
 * Per-node state of the preallocate filter (stored in bs->opaque).
 */
typedef struct BDRVPreallocateState {
    /* Current option values; replaced on reopen commit. */
    PreallocateOpts opts;

    /*
     * Track real data end, to crop preallocation on close. If < 0 the status is
     * unknown.
     *
     * @data_end is the maximum of the file size on open (or at the moment we
     * get write/resize permissions) and the ends of all write requests after
     * that. So it's safe to truncate to data_end if it is valid.
     */
    int64_t data_end;

    /*
     * Start of the trailing preallocated area which reads as zero. May be
     * smaller than data_end, if the user does an over-EOF write-zeroes
     * operation. If < 0 the status is unknown.
     *
     * If both @zero_start and @file_end are valid, the region
     * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
     * is not valid, @zero_start doesn't make much sense.
     */
    int64_t zero_start;

    /*
     * Real end of file. Actually a cache for bdrv_getlength(bs->file->bs),
     * to avoid extra lseek() calls on each write operation. If < 0 the status
     * is unknown.
     */
    int64_t file_end;

    /*
     * All three states @data_end, @zero_start and @file_end are guaranteed to
     * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
     * BLK_PERM_WRITE permissions on the file child.
     */

    /*
     * Gives up the resize permission on children when parents don't need it.
     * Created in preallocate_open(), scheduled/cancelled from
     * preallocate_set_perm() and deleted in preallocate_close().
     */
    QEMUBH *drop_resize_bh;
} BDRVPreallocateState;
82 
/*
 * Forward declarations: preallocate_drop_resize() is called from
 * preallocate_reopen_prepare(), and preallocate_drop_resize_bh() is registered
 * as a bottom-half callback in preallocate_open(); both users precede the
 * definitions.
 */
static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
static void preallocate_drop_resize_bh(void *opaque);
85 
/* Names of the runtime options understood by the preallocate filter. */
#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"

/*
 * Description of the filter's runtime options (both are size-typed).
 * preallocate_absorb_opts() parses options against this list and is also the
 * place where the defaults quoted in the help texts are applied.
 */
static QemuOptsList runtime_opts = {
    .name = "preallocate",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
            .type = QEMU_OPT_SIZE,
            .help = "on preallocation, align file length to this number, "
                "default 1M",
        },
        {
            .name = PREALLOCATE_OPT_PREALLOC_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "how much to preallocate, default 128M",
        },
        { /* end of list */ }
    },
};
106 
107 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
108                                     BlockDriverState *child_bs, Error **errp)
109 {
110     QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
111 
112     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
113         return false;
114     }
115 
116     dest->prealloc_align =
117         qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
118     dest->prealloc_size =
119         qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
120 
121     qemu_opts_del(opts);
122 
123     if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
124         error_setg(errp, "prealloc-align parameter of preallocate filter "
125                    "is not aligned to %llu", BDRV_SECTOR_SIZE);
126         return false;
127     }
128 
129     if (!QEMU_IS_ALIGNED(dest->prealloc_align,
130                          child_bs->bl.request_alignment)) {
131         error_setg(errp, "prealloc-align parameter of preallocate filter "
132                    "is not aligned to underlying node request alignment "
133                    "(%" PRIi32 ")", child_bs->bl.request_alignment);
134         return false;
135     }
136 
137     return true;
138 }
139 
140 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
141                             Error **errp)
142 {
143     BDRVPreallocateState *s = bs->opaque;
144     int ret;
145 
146     /*
147      * s->data_end and friends should be initialized on permission update.
148      * For this to work, mark them invalid.
149      */
150     s->file_end = s->zero_start = s->data_end = -EINVAL;
151     s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs);
152 
153     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
154     if (ret < 0) {
155         return ret;
156     }
157 
158     if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
159         return -EINVAL;
160     }
161 
162     bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
163         (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
164 
165     bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
166         ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
167             bs->file->bs->supported_zero_flags);
168 
169     return 0;
170 }
171 
172 static int preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp)
173 {
174     BDRVPreallocateState *s = bs->opaque;
175     int ret;
176 
177     if (s->file_end < 0) {
178         s->file_end = bdrv_getlength(bs->file->bs);
179         if (s->file_end < 0) {
180             error_setg_errno(errp, -s->file_end, "Failed to get file length");
181             return s->file_end;
182         }
183     }
184 
185     if (s->data_end < s->file_end) {
186         ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
187                             NULL);
188         if (ret < 0) {
189             error_setg_errno(errp, -ret, "Failed to drop preallocation");
190             s->file_end = ret;
191             return ret;
192         }
193         s->file_end = s->data_end;
194     }
195 
196     return 0;
197 }
198 
199 static void preallocate_close(BlockDriverState *bs)
200 {
201     BDRVPreallocateState *s = bs->opaque;
202 
203     qemu_bh_cancel(s->drop_resize_bh);
204     qemu_bh_delete(s->drop_resize_bh);
205 
206     if (s->data_end >= 0) {
207         preallocate_truncate_to_real_size(bs, NULL);
208     }
209 }
210 
211 
/*
 * Handle reopen.
 *
 * We must implement the reopen handlers, otherwise reopen simply doesn't
 * work. They handle the new options and don't care about the preallocation
 * state, as that is handled in the set/check permission handlers.
 */
219 
220 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
221                                       BlockReopenQueue *queue, Error **errp)
222 {
223     PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
224     int ret;
225 
226     if (!preallocate_absorb_opts(opts, reopen_state->options,
227                                  reopen_state->bs->file->bs, errp)) {
228         g_free(opts);
229         return -EINVAL;
230     }
231 
232     /*
233      * Drop the preallocation already here if reopening read-only. The child
234      * might also be reopened read-only and then scheduling a BH during the
235      * permission update is too late.
236      */
237     if ((reopen_state->flags & BDRV_O_RDWR) == 0) {
238         ret = preallocate_drop_resize(reopen_state->bs, errp);
239         if (ret < 0) {
240             g_free(opts);
241             return ret;
242         }
243     }
244 
245     reopen_state->opaque = opts;
246 
247     return 0;
248 }
249 
250 static void preallocate_reopen_commit(BDRVReopenState *state)
251 {
252     BDRVPreallocateState *s = state->bs->opaque;
253 
254     s->opts = *(PreallocateOpts *)state->opaque;
255 
256     g_free(state->opaque);
257     state->opaque = NULL;
258 }
259 
260 static void preallocate_reopen_abort(BDRVReopenState *state)
261 {
262     g_free(state->opaque);
263     state->opaque = NULL;
264 }
265 
266 static int coroutine_fn GRAPH_RDLOCK
267 preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
268                            QEMUIOVector *qiov, size_t qiov_offset,
269                            BdrvRequestFlags flags)
270 {
271     return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
272                                flags);
273 }
274 
275 static int coroutine_fn GRAPH_RDLOCK
276 preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
277 {
278     return bdrv_co_pdiscard(bs->file, offset, bytes);
279 }
280 
281 static bool can_write_resize(uint64_t perm)
282 {
283     return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
284 }
285 
286 static bool has_prealloc_perms(BlockDriverState *bs)
287 {
288     BDRVPreallocateState *s = bs->opaque;
289 
290     if (can_write_resize(bs->file->perm)) {
291         assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
292         assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
293         return true;
294     }
295 
296     assert(s->data_end < 0);
297     assert(s->zero_start < 0);
298     assert(s->file_end < 0);
299     return false;
300 }
301 
302 /*
303  * Call on each write. Returns true if @want_merge_zero is true and the region
304  * [offset, offset + bytes) is zeroed (as a result of this call or earlier
305  * preallocation).
306  *
307  * want_merge_zero is used to merge write-zero request with preallocation in
308  * one bdrv_co_pwrite_zeroes() call.
309  */
310 static bool coroutine_fn GRAPH_RDLOCK
311 handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
312              bool want_merge_zero)
313 {
314     BDRVPreallocateState *s = bs->opaque;
315     int64_t end = offset + bytes;
316     int64_t prealloc_start, prealloc_end;
317     int ret;
318     uint32_t file_align = bs->file->bs->bl.request_alignment;
319     uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
320 
321     assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
322 
323     if (!has_prealloc_perms(bs)) {
324         /* We don't have state neither should try to recover it */
325         return false;
326     }
327 
328     if (s->data_end < 0) {
329         s->data_end = bdrv_co_getlength(bs->file->bs);
330         if (s->data_end < 0) {
331             return false;
332         }
333 
334         if (s->file_end < 0) {
335             s->file_end = s->data_end;
336         }
337     }
338 
339     if (end <= s->data_end) {
340         return false;
341     }
342 
343     /* We have valid s->data_end, and request writes beyond it. */
344 
345     s->data_end = end;
346     if (s->zero_start < 0 || !want_merge_zero) {
347         s->zero_start = end;
348     }
349 
350     if (s->file_end < 0) {
351         s->file_end = bdrv_co_getlength(bs->file->bs);
352         if (s->file_end < 0) {
353             return false;
354         }
355     }
356 
357     /* Now s->data_end, s->zero_start and s->file_end are valid. */
358 
359     if (end <= s->file_end) {
360         /* No preallocation needed. */
361         return want_merge_zero && offset >= s->zero_start;
362     }
363 
364     /* Now we want new preallocation, as request writes beyond s->file_end. */
365 
366     prealloc_start = QEMU_ALIGN_UP(
367             want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
368             file_align);
369     prealloc_end = QEMU_ALIGN_UP(
370             MAX(prealloc_start, end) + s->opts.prealloc_size,
371             prealloc_align);
372 
373     want_merge_zero = want_merge_zero && (prealloc_start <= offset);
374 
375     ret = bdrv_co_pwrite_zeroes(
376             bs->file, prealloc_start, prealloc_end - prealloc_start,
377             BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
378     if (ret < 0) {
379         s->file_end = ret;
380         return false;
381     }
382 
383     s->file_end = prealloc_end;
384     return want_merge_zero;
385 }
386 
387 static int coroutine_fn GRAPH_RDLOCK
388 preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
389                              int64_t bytes, BdrvRequestFlags flags)
390 {
391     bool want_merge_zero =
392         !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
393     if (handle_write(bs, offset, bytes, want_merge_zero)) {
394         return 0;
395     }
396 
397     return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
398 }
399 
400 static int coroutine_fn GRAPH_RDLOCK
401 preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
402                             QEMUIOVector *qiov, size_t qiov_offset,
403                             BdrvRequestFlags flags)
404 {
405     handle_write(bs, offset, bytes, false);
406 
407     return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
408                                 flags);
409 }
410 
411 static int coroutine_fn GRAPH_RDLOCK
412 preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
413                         bool exact, PreallocMode prealloc,
414                         BdrvRequestFlags flags, Error **errp)
415 {
416     ERRP_GUARD();
417     BDRVPreallocateState *s = bs->opaque;
418     int ret;
419 
420     if (s->data_end >= 0 && offset > s->data_end) {
421         if (s->file_end < 0) {
422             s->file_end = bdrv_co_getlength(bs->file->bs);
423             if (s->file_end < 0) {
424                 error_setg(errp, "failed to get file length");
425                 return s->file_end;
426             }
427         }
428 
429         if (prealloc == PREALLOC_MODE_FALLOC) {
430             /*
431              * If offset <= s->file_end, the task is already done, just
432              * update s->data_end, to move part of "filter preallocation"
433              * to "preallocation requested by user".
434              * Otherwise just proceed to preallocate missing part.
435              */
436             if (offset <= s->file_end) {
437                 s->data_end = offset;
438                 return 0;
439             }
440         } else {
441             /*
442              * We have to drop our preallocation, to
443              * - avoid "Cannot use preallocation for shrinking files" in
444              *   case of offset < file_end
445              * - give PREALLOC_MODE_OFF a chance to keep small disk
446              *   usage
447              * - give PREALLOC_MODE_FULL a chance to actually write the
448              *   whole region as user expects
449              */
450             if (s->file_end > s->data_end) {
451                 ret = bdrv_co_truncate(bs->file, s->data_end, true,
452                                        PREALLOC_MODE_OFF, 0, errp);
453                 if (ret < 0) {
454                     s->file_end = ret;
455                     error_prepend(errp, "preallocate-filter: failed to drop "
456                                   "write-zero preallocation: ");
457                     return ret;
458                 }
459                 s->file_end = s->data_end;
460             }
461         }
462 
463         s->data_end = offset;
464     }
465 
466     ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
467     if (ret < 0) {
468         s->file_end = s->zero_start = s->data_end = ret;
469         return ret;
470     }
471 
472     if (has_prealloc_perms(bs)) {
473         s->file_end = s->zero_start = s->data_end = offset;
474     }
475     return 0;
476 }
477 
478 static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs)
479 {
480     return bdrv_co_flush(bs->file->bs);
481 }
482 
483 static int64_t coroutine_fn GRAPH_RDLOCK
484 preallocate_co_getlength(BlockDriverState *bs)
485 {
486     int64_t ret;
487     BDRVPreallocateState *s = bs->opaque;
488 
489     if (s->data_end >= 0) {
490         return s->data_end;
491     }
492 
493     ret = bdrv_co_getlength(bs->file->bs);
494 
495     if (has_prealloc_perms(bs)) {
496         s->file_end = s->zero_start = s->data_end = ret;
497     }
498 
499     return ret;
500 }
501 
502 static int preallocate_drop_resize(BlockDriverState *bs, Error **errp)
503 {
504     BDRVPreallocateState *s = bs->opaque;
505     int ret;
506 
507     if (s->data_end < 0) {
508         return 0;
509     }
510 
511     /*
512      * Before switching children to be read-only, truncate them to remove
513      * the preallocation and let them have the real size.
514      */
515     ret = preallocate_truncate_to_real_size(bs, errp);
516     if (ret < 0) {
517         return ret;
518     }
519 
520     /*
521      * We'll drop our permissions and will allow other users to take write and
522      * resize permissions (see preallocate_child_perm). Anyone will be able to
523      * change the child, so mark all states invalid. We'll regain control if a
524      * parent requests write access again.
525      */
526     s->data_end = s->file_end = s->zero_start = -EINVAL;
527 
528     bdrv_graph_rdlock_main_loop();
529     bdrv_child_refresh_perms(bs, bs->file, NULL);
530     bdrv_graph_rdunlock_main_loop();
531 
532     return 0;
533 }
534 
535 static void preallocate_drop_resize_bh(void *opaque)
536 {
537     /*
538      * In case of errors, we'll simply keep the exclusive lock on the image
539      * indefinitely.
540      */
541     preallocate_drop_resize(opaque, NULL);
542 }
543 
544 static void preallocate_set_perm(BlockDriverState *bs,
545                                  uint64_t perm, uint64_t shared)
546 {
547     BDRVPreallocateState *s = bs->opaque;
548 
549     if (can_write_resize(perm)) {
550         qemu_bh_cancel(s->drop_resize_bh);
551         if (s->data_end < 0) {
552             s->data_end = s->file_end = s->zero_start =
553                 bs->file->bs->total_sectors * BDRV_SECTOR_SIZE;
554         }
555     } else {
556         qemu_bh_schedule(s->drop_resize_bh);
557     }
558 }
559 
560 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
561     BdrvChildRole role, BlockReopenQueue *reopen_queue,
562     uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
563 {
564     BDRVPreallocateState *s = bs->opaque;
565 
566     bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
567 
568     /*
569      * We need exclusive write and resize permissions on the child not only when
570      * the parent can write to it, but also after the parent gave up write
571      * permissions until preallocate_drop_resize() has completed.
572      */
573     if (can_write_resize(perm) || s->data_end != -EINVAL) {
574         *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
575 
576         /*
577          * Don't share, to keep our states s->file_end, s->data_end and
578          * s->zero_start valid.
579          */
580         *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
581     }
582 }
583 
584 static BlockDriver bdrv_preallocate_filter = {
585     .format_name = "preallocate",
586     .instance_size = sizeof(BDRVPreallocateState),
587 
588     .bdrv_co_getlength    = preallocate_co_getlength,
589     .bdrv_open            = preallocate_open,
590     .bdrv_close           = preallocate_close,
591 
592     .bdrv_reopen_prepare  = preallocate_reopen_prepare,
593     .bdrv_reopen_commit   = preallocate_reopen_commit,
594     .bdrv_reopen_abort    = preallocate_reopen_abort,
595 
596     .bdrv_co_preadv_part = preallocate_co_preadv_part,
597     .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
598     .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
599     .bdrv_co_pdiscard = preallocate_co_pdiscard,
600     .bdrv_co_flush = preallocate_co_flush,
601     .bdrv_co_truncate = preallocate_co_truncate,
602 
603     .bdrv_set_perm = preallocate_set_perm,
604     .bdrv_child_perm = preallocate_child_perm,
605 
606     .is_filter = true,
607 };
608 
609 static void bdrv_preallocate_init(void)
610 {
611     bdrv_register(&bdrv_preallocate_filter);
612 }
613 
614 block_init(bdrv_preallocate_init);
615