xref: /openbmc/qemu/block/copy-before-write.c (revision 9d1401b79463e74adbfac69d836789d4e103fb61)
1 /*
2  * copy-before-write filter driver
3  *
4  * The driver performs Copy-Before-Write (CBW) operation: it is injected above
5  * some node, and before each write it copies _old_ data to the target node.
6  *
7  * Copyright (c) 2018-2021 Virtuozzo International GmbH.
8  *
9  * Author:
10  *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
11  *
12  * This program is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU General Public License as published by
14  * the Free Software Foundation; either version 2 of the License, or
15  * (at your option) any later version.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU General Public License for more details.
21  *
22  * You should have received a copy of the GNU General Public License
23  * along with this program. If not, see <http://www.gnu.org/licenses/>.
24  */
25 
26 #include "qemu/osdep.h"
27 
28 #include "sysemu/block-backend.h"
29 #include "qemu/cutils.h"
30 #include "qapi/error.h"
31 #include "block/block_int.h"
32 #include "block/qdict.h"
33 #include "block/block-copy.h"
34 
35 #include "block/copy-before-write.h"
36 #include "block/reqlist.h"
37 
38 #include "qapi/qapi-visit-block-core.h"
39 
40 typedef struct BDRVCopyBeforeWriteState {
41     BlockCopyState *bcs;
42     BdrvChild *target;
43 
44     /*
45      * @lock: protects access to @access_bitmap, @done_bitmap and
46      * @frozen_read_reqs
47      */
48     CoMutex lock;
49 
50     /*
51      * @access_bitmap: represents areas allowed for reading by fleecing user.
52      * Reading from non-dirty areas leads to -EACCES.
53      */
54     BdrvDirtyBitmap *access_bitmap;
55 
56     /*
57      * @done_bitmap: represents areas that was successfully copied to @target by
58      * copy-before-write operations.
59      */
60     BdrvDirtyBitmap *done_bitmap;
61 
62     /*
63      * @frozen_read_reqs: current read requests for fleecing user in bs->file
64      * node. These areas must not be rewritten by guest.
65      */
66     BlockReqList frozen_read_reqs;
67 } BDRVCopyBeforeWriteState;
68 
69 static coroutine_fn int cbw_co_preadv(
70         BlockDriverState *bs, int64_t offset, int64_t bytes,
71         QEMUIOVector *qiov, BdrvRequestFlags flags)
72 {
73     return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
74 }
75 
76 /*
77  * Do copy-before-write operation.
78  *
79  * On failure guest request must be failed too.
80  *
81  * On success, we also wait for all in-flight fleecing read requests in source
82  * node, and it's guaranteed that after cbw_do_copy_before_write() successful
83  * return there are no such requests and they will never appear.
84  */
85 static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs,
86         uint64_t offset, uint64_t bytes, BdrvRequestFlags flags)
87 {
88     BDRVCopyBeforeWriteState *s = bs->opaque;
89     int ret;
90     uint64_t off, end;
91     int64_t cluster_size = block_copy_cluster_size(s->bcs);
92 
93     if (flags & BDRV_REQ_WRITE_UNCHANGED) {
94         return 0;
95     }
96 
97     off = QEMU_ALIGN_DOWN(offset, cluster_size);
98     end = QEMU_ALIGN_UP(offset + bytes, cluster_size);
99 
100     ret = block_copy(s->bcs, off, end - off, true);
101     if (ret < 0) {
102         return ret;
103     }
104 
105     WITH_QEMU_LOCK_GUARD(&s->lock) {
106         bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off);
107         reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock);
108     }
109 
110     return 0;
111 }
112 
113 static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs,
114                                         int64_t offset, int64_t bytes)
115 {
116     int ret = cbw_do_copy_before_write(bs, offset, bytes, 0);
117     if (ret < 0) {
118         return ret;
119     }
120 
121     return bdrv_co_pdiscard(bs->file, offset, bytes);
122 }
123 
124 static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs,
125         int64_t offset, int64_t bytes, BdrvRequestFlags flags)
126 {
127     int ret = cbw_do_copy_before_write(bs, offset, bytes, flags);
128     if (ret < 0) {
129         return ret;
130     }
131 
132     return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
133 }
134 
135 static coroutine_fn int cbw_co_pwritev(BlockDriverState *bs,
136                                        int64_t offset,
137                                        int64_t bytes,
138                                        QEMUIOVector *qiov,
139                                        BdrvRequestFlags flags)
140 {
141     int ret = cbw_do_copy_before_write(bs, offset, bytes, flags);
142     if (ret < 0) {
143         return ret;
144     }
145 
146     return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
147 }
148 
149 static int coroutine_fn cbw_co_flush(BlockDriverState *bs)
150 {
151     if (!bs->file) {
152         return 0;
153     }
154 
155     return bdrv_co_flush(bs->file->bs);
156 }
157 
158 /*
159  * If @offset not accessible - return NULL.
160  *
161  * Otherwise, set @pnum to some bytes that accessible from @file (@file is set
162  * to bs->file or to s->target). Return newly allocated BlockReq object that
163  * should be than passed to cbw_snapshot_read_unlock().
164  *
165  * It's guaranteed that guest writes will not interact in the region until
166  * cbw_snapshot_read_unlock() called.
167  */
168 static BlockReq *cbw_snapshot_read_lock(BlockDriverState *bs,
169                                         int64_t offset, int64_t bytes,
170                                         int64_t *pnum, BdrvChild **file)
171 {
172     BDRVCopyBeforeWriteState *s = bs->opaque;
173     BlockReq *req = g_new(BlockReq, 1);
174     bool done;
175 
176     QEMU_LOCK_GUARD(&s->lock);
177 
178     if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) {
179         g_free(req);
180         return NULL;
181     }
182 
183     done = bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, pnum);
184     if (done) {
185         /*
186          * Special invalid BlockReq, that is handled in
187          * cbw_snapshot_read_unlock(). We don't need to lock something to read
188          * from s->target.
189          */
190         *req = (BlockReq) {.offset = -1, .bytes = -1};
191         *file = s->target;
192     } else {
193         reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes);
194         *file = bs->file;
195     }
196 
197     return req;
198 }
199 
200 static void cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req)
201 {
202     BDRVCopyBeforeWriteState *s = bs->opaque;
203 
204     if (req->offset == -1 && req->bytes == -1) {
205         g_free(req);
206         return;
207     }
208 
209     QEMU_LOCK_GUARD(&s->lock);
210 
211     reqlist_remove_req(req);
212     g_free(req);
213 }
214 
215 static coroutine_fn int
216 cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes,
217                        QEMUIOVector *qiov, size_t qiov_offset)
218 {
219     BlockReq *req;
220     BdrvChild *file;
221     int ret;
222 
223     /* TODO: upgrade to async loop using AioTask */
224     while (bytes) {
225         int64_t cur_bytes;
226 
227         req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &file);
228         if (!req) {
229             return -EACCES;
230         }
231 
232         ret = bdrv_co_preadv_part(file, offset, cur_bytes,
233                                   qiov, qiov_offset, 0);
234         cbw_snapshot_read_unlock(bs, req);
235         if (ret < 0) {
236             return ret;
237         }
238 
239         bytes -= cur_bytes;
240         offset += cur_bytes;
241         qiov_offset += cur_bytes;
242     }
243 
244     return 0;
245 }
246 
247 static int coroutine_fn
248 cbw_co_snapshot_block_status(BlockDriverState *bs,
249                              bool want_zero, int64_t offset, int64_t bytes,
250                              int64_t *pnum, int64_t *map,
251                              BlockDriverState **file)
252 {
253     BDRVCopyBeforeWriteState *s = bs->opaque;
254     BlockReq *req;
255     int ret;
256     int64_t cur_bytes;
257     BdrvChild *child;
258 
259     req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &child);
260     if (!req) {
261         return -EACCES;
262     }
263 
264     ret = bdrv_block_status(child->bs, offset, cur_bytes, pnum, map, file);
265     if (child == s->target) {
266         /*
267          * We refer to s->target only for areas that we've written to it.
268          * And we can not report unallocated blocks in s->target: this will
269          * break generic block-status-above logic, that will go to
270          * copy-before-write filtered child in this case.
271          */
272         assert(ret & BDRV_BLOCK_ALLOCATED);
273     }
274 
275     cbw_snapshot_read_unlock(bs, req);
276 
277     return ret;
278 }
279 
280 static int coroutine_fn cbw_co_pdiscard_snapshot(BlockDriverState *bs,
281                                                  int64_t offset, int64_t bytes)
282 {
283     BDRVCopyBeforeWriteState *s = bs->opaque;
284 
285     WITH_QEMU_LOCK_GUARD(&s->lock) {
286         bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
287     }
288 
289     block_copy_reset(s->bcs, offset, bytes);
290 
291     return bdrv_co_pdiscard(s->target, offset, bytes);
292 }
293 
294 static void cbw_refresh_filename(BlockDriverState *bs)
295 {
296     pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
297             bs->file->bs->filename);
298 }
299 
300 static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
301                            BdrvChildRole role,
302                            BlockReopenQueue *reopen_queue,
303                            uint64_t perm, uint64_t shared,
304                            uint64_t *nperm, uint64_t *nshared)
305 {
306     if (!(role & BDRV_CHILD_FILTERED)) {
307         /*
308          * Target child
309          *
310          * Share write to target (child_file), to not interfere
311          * with guest writes to its disk which may be in target backing chain.
312          * Can't resize during a backup block job because we check the size
313          * only upfront.
314          */
315         *nshared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
316         *nperm = BLK_PERM_WRITE;
317     } else {
318         /* Source child */
319         bdrv_default_perms(bs, c, role, reopen_queue,
320                            perm, shared, nperm, nshared);
321 
322         if (!QLIST_EMPTY(&bs->parents)) {
323             if (perm & BLK_PERM_WRITE) {
324                 *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
325             }
326             *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
327         }
328     }
329 }
330 
331 static bool cbw_parse_bitmap_option(QDict *options, BdrvDirtyBitmap **bitmap,
332                                     Error **errp)
333 {
334     QDict *bitmap_qdict = NULL;
335     BlockDirtyBitmap *bmp_param = NULL;
336     Visitor *v = NULL;
337     bool ret = false;
338 
339     *bitmap = NULL;
340 
341     qdict_extract_subqdict(options, &bitmap_qdict, "bitmap.");
342     if (!qdict_size(bitmap_qdict)) {
343         ret = true;
344         goto out;
345     }
346 
347     v = qobject_input_visitor_new_flat_confused(bitmap_qdict, errp);
348     if (!v) {
349         goto out;
350     }
351 
352     visit_type_BlockDirtyBitmap(v, NULL, &bmp_param, errp);
353     if (!bmp_param) {
354         goto out;
355     }
356 
357     *bitmap = block_dirty_bitmap_lookup(bmp_param->node, bmp_param->name, NULL,
358                                         errp);
359     if (!*bitmap) {
360         goto out;
361     }
362 
363     ret = true;
364 
365 out:
366     qapi_free_BlockDirtyBitmap(bmp_param);
367     visit_free(v);
368     qobject_unref(bitmap_qdict);
369 
370     return ret;
371 }
372 
373 static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
374                     Error **errp)
375 {
376     BDRVCopyBeforeWriteState *s = bs->opaque;
377     BdrvDirtyBitmap *bitmap = NULL;
378     int64_t cluster_size;
379 
380     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
381                                BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
382                                false, errp);
383     if (!bs->file) {
384         return -EINVAL;
385     }
386 
387     s->target = bdrv_open_child(NULL, options, "target", bs, &child_of_bds,
388                                 BDRV_CHILD_DATA, false, errp);
389     if (!s->target) {
390         return -EINVAL;
391     }
392 
393     if (!cbw_parse_bitmap_option(options, &bitmap, errp)) {
394         return -EINVAL;
395     }
396 
397     bs->total_sectors = bs->file->bs->total_sectors;
398     bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
399             (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
400     bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
401             ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
402              bs->file->bs->supported_zero_flags);
403 
404     s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
405     if (!s->bcs) {
406         error_prepend(errp, "Cannot create block-copy-state: ");
407         return -EINVAL;
408     }
409 
410     cluster_size = block_copy_cluster_size(s->bcs);
411 
412     s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
413     if (!s->done_bitmap) {
414         return -EINVAL;
415     }
416     bdrv_disable_dirty_bitmap(s->done_bitmap);
417 
418     /* s->access_bitmap starts equal to bcs bitmap */
419     s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
420     if (!s->access_bitmap) {
421         return -EINVAL;
422     }
423     bdrv_disable_dirty_bitmap(s->access_bitmap);
424     bdrv_dirty_bitmap_merge_internal(s->access_bitmap,
425                                      block_copy_dirty_bitmap(s->bcs), NULL,
426                                      true);
427 
428     qemu_co_mutex_init(&s->lock);
429     QLIST_INIT(&s->frozen_read_reqs);
430 
431     return 0;
432 }
433 
434 static void cbw_close(BlockDriverState *bs)
435 {
436     BDRVCopyBeforeWriteState *s = bs->opaque;
437 
438     bdrv_release_dirty_bitmap(s->access_bitmap);
439     bdrv_release_dirty_bitmap(s->done_bitmap);
440 
441     block_copy_state_free(s->bcs);
442     s->bcs = NULL;
443 }
444 
445 BlockDriver bdrv_cbw_filter = {
446     .format_name = "copy-before-write",
447     .instance_size = sizeof(BDRVCopyBeforeWriteState),
448 
449     .bdrv_open                  = cbw_open,
450     .bdrv_close                 = cbw_close,
451 
452     .bdrv_co_preadv             = cbw_co_preadv,
453     .bdrv_co_pwritev            = cbw_co_pwritev,
454     .bdrv_co_pwrite_zeroes      = cbw_co_pwrite_zeroes,
455     .bdrv_co_pdiscard           = cbw_co_pdiscard,
456     .bdrv_co_flush              = cbw_co_flush,
457 
458     .bdrv_co_preadv_snapshot       = cbw_co_preadv_snapshot,
459     .bdrv_co_pdiscard_snapshot     = cbw_co_pdiscard_snapshot,
460     .bdrv_co_snapshot_block_status = cbw_co_snapshot_block_status,
461 
462     .bdrv_refresh_filename      = cbw_refresh_filename,
463 
464     .bdrv_child_perm            = cbw_child_perm,
465 
466     .is_filter = true,
467 };
468 
469 BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
470                                   BlockDriverState *target,
471                                   const char *filter_node_name,
472                                   BlockCopyState **bcs,
473                                   Error **errp)
474 {
475     ERRP_GUARD();
476     BDRVCopyBeforeWriteState *state;
477     BlockDriverState *top;
478     QDict *opts;
479 
480     assert(source->total_sectors == target->total_sectors);
481     GLOBAL_STATE_CODE();
482 
483     opts = qdict_new();
484     qdict_put_str(opts, "driver", "copy-before-write");
485     if (filter_node_name) {
486         qdict_put_str(opts, "node-name", filter_node_name);
487     }
488     qdict_put_str(opts, "file", bdrv_get_node_name(source));
489     qdict_put_str(opts, "target", bdrv_get_node_name(target));
490 
491     top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
492     if (!top) {
493         return NULL;
494     }
495 
496     state = top->opaque;
497     *bcs = state->bcs;
498 
499     return top;
500 }
501 
502 void bdrv_cbw_drop(BlockDriverState *bs)
503 {
504     GLOBAL_STATE_CODE();
505     bdrv_drop_filter(bs, &error_abort);
506     bdrv_unref(bs);
507 }
508 
509 static void cbw_init(void)
510 {
511     bdrv_register(&bdrv_cbw_filter);
512 }
513 
514 block_init(cbw_init);
515