xref: /openbmc/qemu/block/preallocate.c (revision ad6ef0a42e314a8c6ac6c96d5f6e607a1e5644b5)
1 /*
2  * preallocate filter driver
3  *
4  * The driver performs preallocate operation: it is injected above
5  * some node, and before each write over EOF it does additional preallocating
6  * write-zeroes request.
7  *
8  * Copyright (c) 2020 Virtuozzo International GmbH.
9  *
10  * Author:
11  *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program. If not, see <http://www.gnu.org/licenses/>.
25  */
26 
27 #include "qemu/osdep.h"
28 
29 #include "qapi/error.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/units.h"
33 #include "block/block-io.h"
34 #include "block/block_int.h"
35 
36 
/*
 * User-configurable options of the preallocate filter, filled in by
 * preallocate_absorb_opts() from the "prealloc-size" and "prealloc-align"
 * runtime options.
 */
typedef struct PreallocateOpts {
    /* How many bytes to preallocate past the end of a write (default 128M) */
    int64_t prealloc_size;
    /* The preallocated file end is rounded up to this value (default 1M) */
    int64_t prealloc_align;
} PreallocateOpts;
41 
/* Per-node state of the preallocate filter (stored in bs->opaque). */
typedef struct BDRVPreallocateState {
    /* Currently active options; replaced on a successful reopen. */
    PreallocateOpts opts;

    /*
     * Track real data end, to crop preallocation on close. If < 0 the status is
     * unknown.
     *
     * @data_end is the maximum of the file size on open (or when we get
     * write/resize permissions) and the ends of all write requests after
     * that. So it is safe to truncate to data_end if it is valid.
     */
    int64_t data_end;

    /*
     * Start of the trailing preallocated area which reads as zero. May be
     * smaller than data_end, if the user does an over-EOF write-zeroes
     * operation. If < 0 the status is unknown.
     *
     * If both @zero_start and @file_end are valid, the region
     * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
     * is not valid, @zero_start doesn't make much sense.
     */
    int64_t zero_start;

    /*
     * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
     * to avoid extra lseek() calls on each write operation. If < 0 the status
     * is unknown.
     */
    int64_t file_end;

    /*
     * All three states @data_end, @zero_start and @file_end are guaranteed to
     * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
     * BLK_PERM_WRITE permissions on file child.
     */

    /* Gives up the resize permission on children when parents don't need it */
    QEMUBH *drop_resize_bh;
} BDRVPreallocateState;
82 
/* Defined further down; needed earlier by the reopen and open handlers. */
static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
static void preallocate_drop_resize_bh(void *opaque);

/* Names of the runtime options understood by the filter. */
#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"

/*
 * Option schema used by preallocate_absorb_opts() to pull the filter's
 * options out of the options QDict. Both options are sizes.
 */
static QemuOptsList runtime_opts = {
    .name = "preallocate",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
            .type = QEMU_OPT_SIZE,
            .help = "on preallocation, align file length to this number, "
                "default 1M",
        },
        {
            .name = PREALLOCATE_OPT_PREALLOC_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "how much to preallocate, default 128M",
        },
        { /* end of list */ }
    },
};
106 
/*
 * Extract the filter's options from @options into @dest and validate them.
 *
 * @dest:     receives prealloc_align (default 1 MiB) and prealloc_size
 *            (default 128 MiB)
 * @options:  options dictionary; the recognised keys are absorbed from it
 * @child_bs: the filtered node, whose request alignment prealloc-align must
 *            be a multiple of
 * @errp:     set on failure
 *
 * Returns true on success, false (with @errp set) if the options cannot be
 * absorbed or prealloc-align is not suitably aligned. @dest may have been
 * written even when false is returned.
 */
static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
                                    BlockDriverState *child_bs, Error **errp)
{
    QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);

    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        /* Do not leak the temporary QemuOpts on the error path. */
        qemu_opts_del(opts);
        return false;
    }

    dest->prealloc_align =
        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
    dest->prealloc_size =
        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);

    /* The values have been copied out; the QemuOpts is no longer needed. */
    qemu_opts_del(opts);

    if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
        error_setg(errp, "prealloc-align parameter of preallocate filter "
                   "is not aligned to %llu", BDRV_SECTOR_SIZE);
        return false;
    }

    if (!QEMU_IS_ALIGNED(dest->prealloc_align,
                         child_bs->bl.request_alignment)) {
        /* request_alignment is printed as unsigned (PRIu32). */
        error_setg(errp, "prealloc-align parameter of preallocate filter "
                   "is not aligned to underlying node request alignment "
                   "(%" PRIu32 ")", child_bs->bl.request_alignment);
        return false;
    }

    return true;
}
139 
/*
 * .bdrv_open handler: set up the filter state, open the "file" child and
 * absorb the filter options.
 *
 * Returns 0 on success, a negative errno value (with @errp set) on failure.
 * On failure the bottom half created here is released again, because the
 * matching release otherwise only happens in preallocate_close().
 * NOTE(review): this assumes the block layer does not call .bdrv_close after
 * a failed .bdrv_open -- confirm against bdrv_open_driver().
 */
static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
                            Error **errp)
{
    BDRVPreallocateState *s = bs->opaque;
    int ret;

    GLOBAL_STATE_CODE();

    /*
     * s->data_end and friends should be initialized on permission update.
     * For this to work, mark them invalid.
     */
    s->file_end = s->zero_start = s->data_end = -EINVAL;
    s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs);

    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    if (ret < 0) {
        /* Cancel as well as delete: the BH refers to @bs and must not run. */
        qemu_bh_cancel(s->drop_resize_bh);
        qemu_bh_delete(s->drop_resize_bh);
        s->drop_resize_bh = NULL;
        return ret;
    }

    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
        qemu_bh_cancel(s->drop_resize_bh);
        qemu_bh_delete(s->drop_resize_bh);
        s->drop_resize_bh = NULL;
        return -EINVAL;
    }

    /* Only advertise the flags that the filtered node supports as well. */
    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);

    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
            bs->file->bs->supported_zero_flags);

    return 0;
}
175 
176 static int GRAPH_RDLOCK
preallocate_truncate_to_real_size(BlockDriverState * bs,Error ** errp)177 preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp)
178 {
179     BDRVPreallocateState *s = bs->opaque;
180     int ret;
181 
182     if (s->file_end < 0) {
183         s->file_end = bdrv_getlength(bs->file->bs);
184         if (s->file_end < 0) {
185             error_setg_errno(errp, -s->file_end, "Failed to get file length");
186             return s->file_end;
187         }
188     }
189 
190     if (s->data_end < s->file_end) {
191         ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
192                             NULL);
193         if (ret < 0) {
194             error_setg_errno(errp, -ret, "Failed to drop preallocation");
195             s->file_end = ret;
196             return ret;
197         }
198         s->file_end = s->data_end;
199     }
200 
201     return 0;
202 }
203 
/*
 * .bdrv_close handler: release the bottom half and, if the real data end is
 * known, drop the preallocated tail (errors are ignored).
 */
static void preallocate_close(BlockDriverState *bs)
{
    BDRVPreallocateState *s = bs->opaque;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    qemu_bh_cancel(s->drop_resize_bh);
    qemu_bh_delete(s->drop_resize_bh);

    if (s->data_end < 0) {
        /* State is invalid; nothing we may safely truncate. */
        return;
    }

    preallocate_truncate_to_real_size(bs, NULL);
}
218 
219 
/*
 * Handle reopen.
 *
 * We must implement reopen handlers, otherwise reopen simply doesn't work.
 * Handle new options and don't care about the preallocation state, as it is
 * handled in the set/check permission handlers.
 */
227 
/*
 * .bdrv_reopen_prepare handler: parse the new options into a freshly
 * allocated PreallocateOpts and stash it in reopen_state->opaque for
 * commit/abort.
 *
 * Returns 0 on success or a negative errno value with @errp set; on failure
 * nothing is stored in reopen_state->opaque.
 */
static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
                                      BlockReopenQueue *queue, Error **errp)
{
    PreallocateOpts *new_opts = g_new0(PreallocateOpts, 1);
    int ret = -EINVAL;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (!preallocate_absorb_opts(new_opts, reopen_state->options,
                                 reopen_state->bs->file->bs, errp)) {
        goto fail;
    }

    /*
     * Drop the preallocation already here if reopening read-only. The child
     * might also be reopened read-only and then scheduling a BH during the
     * permission update is too late.
     */
    if (!(reopen_state->flags & BDRV_O_RDWR)) {
        ret = preallocate_drop_resize(reopen_state->bs, errp);
        if (ret < 0) {
            goto fail;
        }
    }

    reopen_state->opaque = new_opts;
    return 0;

fail:
    g_free(new_opts);
    return ret;
}
260 
/*
 * .bdrv_reopen_commit handler: make the options prepared by
 * preallocate_reopen_prepare() the active ones and free the temporary copy.
 */
static void preallocate_reopen_commit(BDRVReopenState *state)
{
    BDRVPreallocateState *s = state->bs->opaque;
    PreallocateOpts *prepared = state->opaque;

    s->opts = *prepared;

    g_free(state->opaque);
    state->opaque = NULL;
}
270 
/*
 * .bdrv_reopen_abort handler: discard the options prepared by
 * preallocate_reopen_prepare() without applying them.
 */
static void preallocate_reopen_abort(BDRVReopenState *state)
{
    void *prepared = state->opaque;

    state->opaque = NULL;
    g_free(prepared);
}
276 
277 static int coroutine_fn GRAPH_RDLOCK
preallocate_co_preadv_part(BlockDriverState * bs,int64_t offset,int64_t bytes,QEMUIOVector * qiov,size_t qiov_offset,BdrvRequestFlags flags)278 preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
279                            QEMUIOVector *qiov, size_t qiov_offset,
280                            BdrvRequestFlags flags)
281 {
282     return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
283                                flags);
284 }
285 
286 static int coroutine_fn GRAPH_RDLOCK
preallocate_co_pdiscard(BlockDriverState * bs,int64_t offset,int64_t bytes)287 preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
288 {
289     return bdrv_co_pdiscard(bs->file, offset, bytes);
290 }
291 
can_write_resize(uint64_t perm)292 static bool can_write_resize(uint64_t perm)
293 {
294     return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
295 }
296 
has_prealloc_perms(BlockDriverState * bs)297 static bool GRAPH_RDLOCK has_prealloc_perms(BlockDriverState *bs)
298 {
299     BDRVPreallocateState *s = bs->opaque;
300 
301     if (can_write_resize(bs->file->perm)) {
302         assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
303         assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
304         return true;
305     }
306 
307     assert(s->data_end < 0);
308     assert(s->zero_start < 0);
309     assert(s->file_end < 0);
310     return false;
311 }
312 
313 /*
314  * Call on each write. Returns true if @want_merge_zero is true and the region
315  * [offset, offset + bytes) is zeroed (as a result of this call or earlier
316  * preallocation).
317  *
318  * want_merge_zero is used to merge write-zero request with preallocation in
319  * one bdrv_co_pwrite_zeroes() call.
320  */
321 static bool coroutine_fn GRAPH_RDLOCK
handle_write(BlockDriverState * bs,int64_t offset,int64_t bytes,bool want_merge_zero)322 handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
323              bool want_merge_zero)
324 {
325     BDRVPreallocateState *s = bs->opaque;
326     int64_t end = offset + bytes;
327     int64_t prealloc_start, prealloc_end;
328     int ret;
329     uint32_t file_align = bs->file->bs->bl.request_alignment;
330     uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
331 
332     assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
333 
334     if (!has_prealloc_perms(bs)) {
335         /* We don't have state neither should try to recover it */
336         return false;
337     }
338 
339     if (s->data_end < 0) {
340         s->data_end = bdrv_co_getlength(bs->file->bs);
341         if (s->data_end < 0) {
342             return false;
343         }
344 
345         if (s->file_end < 0) {
346             s->file_end = s->data_end;
347         }
348     }
349 
350     if (end <= s->data_end) {
351         return false;
352     }
353 
354     /* We have valid s->data_end, and request writes beyond it. */
355 
356     s->data_end = end;
357     if (s->zero_start < 0 || !want_merge_zero) {
358         s->zero_start = end;
359     }
360 
361     if (s->file_end < 0) {
362         s->file_end = bdrv_co_getlength(bs->file->bs);
363         if (s->file_end < 0) {
364             return false;
365         }
366     }
367 
368     /* Now s->data_end, s->zero_start and s->file_end are valid. */
369 
370     if (end <= s->file_end) {
371         /* No preallocation needed. */
372         return want_merge_zero && offset >= s->zero_start;
373     }
374 
375     /* Now we want new preallocation, as request writes beyond s->file_end. */
376 
377     prealloc_start = QEMU_ALIGN_UP(
378             want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
379             file_align);
380     prealloc_end = QEMU_ALIGN_UP(
381             MAX(prealloc_start, end) + s->opts.prealloc_size,
382             prealloc_align);
383 
384     want_merge_zero = want_merge_zero && (prealloc_start <= offset);
385 
386     ret = bdrv_co_pwrite_zeroes(
387             bs->file, prealloc_start, prealloc_end - prealloc_start,
388             BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
389     if (ret < 0) {
390         s->file_end = ret;
391         return false;
392     }
393 
394     s->file_end = prealloc_end;
395     return want_merge_zero;
396 }
397 
398 static int coroutine_fn GRAPH_RDLOCK
preallocate_co_pwrite_zeroes(BlockDriverState * bs,int64_t offset,int64_t bytes,BdrvRequestFlags flags)399 preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
400                              int64_t bytes, BdrvRequestFlags flags)
401 {
402     bool want_merge_zero =
403         !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
404     if (handle_write(bs, offset, bytes, want_merge_zero)) {
405         return 0;
406     }
407 
408     return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
409 }
410 
411 static int coroutine_fn GRAPH_RDLOCK
preallocate_co_pwritev_part(BlockDriverState * bs,int64_t offset,int64_t bytes,QEMUIOVector * qiov,size_t qiov_offset,BdrvRequestFlags flags)412 preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
413                             QEMUIOVector *qiov, size_t qiov_offset,
414                             BdrvRequestFlags flags)
415 {
416     handle_write(bs, offset, bytes, false);
417 
418     return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
419                                 flags);
420 }
421 
/*
 * Truncate handler.
 *
 * When the tracked state is valid and the file grows past s->data_end, the
 * filter's own preallocation is either reused (PREALLOC_MODE_FALLOC) or
 * dropped first (all other modes) before the request is passed on to the
 * file child.
 *
 * Returns 0 on success or a negative errno value with @errp set. A failure
 * of the final truncate invalidates all cached state.
 */
static int coroutine_fn GRAPH_RDLOCK
preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
                        bool exact, PreallocMode prealloc,
                        BdrvRequestFlags flags, Error **errp)
{
    ERRP_GUARD();
    BDRVPreallocateState *s = bs->opaque;
    int ret;

    if (s->data_end >= 0 && offset > s->data_end) {
        /* Growing past the known data end; we need a valid file_end. */
        if (s->file_end < 0) {
            s->file_end = bdrv_co_getlength(bs->file->bs);
            if (s->file_end < 0) {
                error_setg(errp, "failed to get file length");
                return s->file_end;
            }
        }

        if (prealloc == PREALLOC_MODE_FALLOC) {
            /*
             * If offset <= s->file_end, the task is already done, just
             * update s->data_end, to move part of "filter preallocation"
             * to "preallocation requested by user".
             * Otherwise just proceed to preallocate missing part.
             */
            if (offset <= s->file_end) {
                s->data_end = offset;
                return 0;
            }
        } else {
            /*
             * We have to drop our preallocation, to
             * - avoid "Cannot use preallocation for shrinking files" in
             *   case of offset < file_end
             * - give PREALLOC_MODE_OFF a chance to keep small disk
             *   usage
             * - give PREALLOC_MODE_FULL a chance to actually write the
             *   whole region as user expects
             */
            if (s->file_end > s->data_end) {
                ret = bdrv_co_truncate(bs->file, s->data_end, true,
                                       PREALLOC_MODE_OFF, 0, errp);
                if (ret < 0) {
                    /* File size is unknown after the failed truncate. */
                    s->file_end = ret;
                    error_prepend(errp, "preallocate-filter: failed to drop "
                                  "write-zero preallocation: ");
                    return ret;
                }
                s->file_end = s->data_end;
            }
        }

        s->data_end = offset;
    }

    /* Now perform the truncate that was actually requested. */
    ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
    if (ret < 0) {
        /* Nothing is known about the file any more; invalidate everything. */
        s->file_end = s->zero_start = s->data_end = ret;
        return ret;
    }

    /* Only cache the new size while we hold the permissions. */
    if (has_prealloc_perms(bs)) {
        s->file_end = s->zero_start = s->data_end = offset;
    }
    return 0;
}
488 
preallocate_co_flush(BlockDriverState * bs)489 static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs)
490 {
491     return bdrv_co_flush(bs->file->bs);
492 }
493 
494 static int64_t coroutine_fn GRAPH_RDLOCK
preallocate_co_getlength(BlockDriverState * bs)495 preallocate_co_getlength(BlockDriverState *bs)
496 {
497     int64_t ret;
498     BDRVPreallocateState *s = bs->opaque;
499 
500     if (s->data_end >= 0) {
501         return s->data_end;
502     }
503 
504     ret = bdrv_co_getlength(bs->file->bs);
505 
506     if (has_prealloc_perms(bs)) {
507         s->file_end = s->zero_start = s->data_end = ret;
508     }
509 
510     return ret;
511 }
512 
513 static int GRAPH_RDLOCK
preallocate_drop_resize(BlockDriverState * bs,Error ** errp)514 preallocate_drop_resize(BlockDriverState *bs, Error **errp)
515 {
516     BDRVPreallocateState *s = bs->opaque;
517     int ret;
518 
519     if (s->data_end < 0) {
520         return 0;
521     }
522 
523     /*
524      * Before switching children to be read-only, truncate them to remove
525      * the preallocation and let them have the real size.
526      */
527     ret = preallocate_truncate_to_real_size(bs, errp);
528     if (ret < 0) {
529         return ret;
530     }
531 
532     /*
533      * We'll drop our permissions and will allow other users to take write and
534      * resize permissions (see preallocate_child_perm). Anyone will be able to
535      * change the child, so mark all states invalid. We'll regain control if a
536      * parent requests write access again.
537      */
538     s->data_end = s->file_end = s->zero_start = -EINVAL;
539 
540     bdrv_child_refresh_perms(bs, bs->file, NULL);
541 
542     return 0;
543 }
544 
preallocate_drop_resize_bh(void * opaque)545 static void preallocate_drop_resize_bh(void *opaque)
546 {
547     GLOBAL_STATE_CODE();
548     GRAPH_RDLOCK_GUARD_MAINLOOP();
549 
550     /*
551      * In case of errors, we'll simply keep the exclusive lock on the image
552      * indefinitely.
553      */
554     preallocate_drop_resize(opaque, NULL);
555 }
556 
557 static void GRAPH_RDLOCK
preallocate_set_perm(BlockDriverState * bs,uint64_t perm,uint64_t shared)558 preallocate_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
559 {
560     BDRVPreallocateState *s = bs->opaque;
561 
562     if (can_write_resize(perm)) {
563         qemu_bh_cancel(s->drop_resize_bh);
564         if (s->data_end < 0) {
565             s->data_end = s->file_end = s->zero_start =
566                 bs->file->bs->total_sectors * BDRV_SECTOR_SIZE;
567         }
568     } else {
569         qemu_bh_schedule(s->drop_resize_bh);
570     }
571 }
572 
/*
 * .bdrv_child_perm handler: start from the default permissions and, while
 * preallocation is (or may still be) active, additionally take write and
 * resize on the child and stop sharing them.
 */
static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
    BdrvChildRole role, BlockReopenQueue *reopen_queue,
    uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
{
    BDRVPreallocateState *s = bs->opaque;

    bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);

    /*
     * We need exclusive write and resize permissions on the child not only when
     * the parent can write to it, but also after the parent gave up write
     * permissions until preallocate_drop_resize() has completed.
     */
    if (!can_write_resize(perm) && s->data_end == -EINVAL) {
        return;
    }

    *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;

    /*
     * Don't share, to keep our states s->file_end, s->data_end and
     * s->zero_start valid.
     */
    *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
}
596 
/* Driver table of the "preallocate" filter, wiring up the handlers above. */
static BlockDriver bdrv_preallocate_filter = {
    .format_name = "preallocate",
    .instance_size = sizeof(BDRVPreallocateState),

    /* Lifecycle */
    .bdrv_co_getlength    = preallocate_co_getlength,
    .bdrv_open            = preallocate_open,
    .bdrv_close           = preallocate_close,

    /* Reopen */
    .bdrv_reopen_prepare  = preallocate_reopen_prepare,
    .bdrv_reopen_commit   = preallocate_reopen_commit,
    .bdrv_reopen_abort    = preallocate_reopen_abort,

    /* I/O */
    .bdrv_co_preadv_part = preallocate_co_preadv_part,
    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
    .bdrv_co_pdiscard = preallocate_co_pdiscard,
    .bdrv_co_flush = preallocate_co_flush,
    .bdrv_co_truncate = preallocate_co_truncate,

    /* Permissions */
    .bdrv_set_perm = preallocate_set_perm,
    .bdrv_child_perm = preallocate_child_perm,

    .is_filter = true,
};
621 
/* Register the preallocate filter driver with the block layer. */
static void bdrv_preallocate_init(void)
{
    bdrv_register(&bdrv_preallocate_filter);
}

/* Hook the registration function into the block module initialisation. */
block_init(bdrv_preallocate_init);
628