xref: /openbmc/qemu/block/replication.c (revision 88cd34ee)
1 /*
2  * Replication Block filter
3  *
4  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
5  * Copyright (c) 2016 Intel Corporation
6  * Copyright (c) 2016 FUJITSU LIMITED
7  *
8  * Author:
9  *   Wen Congyang <wency@cn.fujitsu.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qemu/module.h"
17 #include "qemu/option.h"
18 #include "block/nbd.h"
19 #include "block/blockjob.h"
20 #include "block/block_int.h"
21 #include "block/block_backup.h"
22 #include "sysemu/block-backend.h"
23 #include "qapi/error.h"
24 #include "qapi/qmp/qdict.h"
25 #include "replication.h"
26 
/*
 * Lifecycle stage of a replication node, stored in
 * BDRVReplicationState.stage.
 *
 * As driven by the code in this file: replication_start() moves NONE to
 * RUNNING; replication_stop() moves RUNNING to DONE (primary, or secondary
 * without failover) or to FAILOVER (secondary with failover); the commit
 * job completion callback replication_done() then settles FAILOVER into
 * DONE or FAILOVER_FAILED.
 */
typedef enum {
    BLOCK_REPLICATION_NONE,             /* block replication is not started */
    BLOCK_REPLICATION_RUNNING,          /* block replication is running */
    BLOCK_REPLICATION_FAILOVER,         /* failover is running in background */
    BLOCK_REPLICATION_FAILOVER_FAILED,  /* failover failed */
    BLOCK_REPLICATION_DONE,             /* block replication is done */
} ReplicationStage;
34 
/* Per-node driver state of the replication filter (bs->opaque). */
typedef struct BDRVReplicationState {
    /* Primary or secondary; parsed from the "mode" option at open time */
    ReplicationMode mode;
    /* Current lifecycle stage, see ReplicationStage */
    ReplicationStage stage;
    /* Secondary only: bs->file, captured by replication_start() */
    BdrvChild *active_disk;
    /* Active commit job started by replication_stop() on failover */
    BlockJob *commit_job;
    /* Secondary only: backing child of the active disk */
    BdrvChild *hidden_disk;
    /* Secondary only: backing child of the hidden disk */
    BdrvChild *secondary_disk;
    /* Internal backup job created by replication_start() (secondary only) */
    BlockJob *backup_job;
    /* Secondary only: copy of the "top-id" option, resolved via bdrv_lookup_bs() */
    char *top_id;
    /* Handle returned by replication_new() for this node */
    ReplicationState *rs;
    /* Reason object used to block/unblock operations on the top node */
    Error *blocker;
    /* Read-only state of the hidden disk before it was reopened r/w */
    bool orig_hidden_read_only;
    /* Read-only state of the secondary disk before it was reopened r/w */
    bool orig_secondary_read_only;
    /* 0, or a negative errno recorded for replication_get_error() */
    int error;
} BDRVReplicationState;
50 
/*
 * Forward declarations of the replication callbacks: they are referenced
 * by the replication_ops table and by replication_close() before their
 * definitions appear further down in this file.
 */
static void replication_start(ReplicationState *rs, ReplicationMode mode,
                              Error **errp);
static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
static void replication_get_error(ReplicationState *rs, Error **errp);
static void replication_stop(ReplicationState *rs, bool failover,
                             Error **errp);
57 
/* Names of the driver-specific runtime options parsed by replication_open() */
#define REPLICATION_MODE        "mode"
#define REPLICATION_TOP_ID      "top-id"
/*
 * Option schema used by replication_open() to absorb the driver options
 * from the options QDict. Both options are plain strings.
 */
static QemuOptsList replication_runtime_opts = {
    .name = "replication",
    .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
    .desc = {
        {
            /* "primary" or "secondary"; mandatory */
            .name = REPLICATION_MODE,
            .type = QEMU_OPT_STRING,
        },
        {
            /* required in secondary mode, rejected in primary mode */
            .name = REPLICATION_TOP_ID,
            .type = QEMU_OPT_STRING,
        },
        { /* end of list */ }
    },
};
75 
/*
 * Callback table handed to replication_new() in replication_open(); it
 * binds the generic replication operations to this driver's handlers.
 */
static ReplicationOps replication_ops = {
    .start = replication_start,
    .checkpoint = replication_do_checkpoint,
    .get_error = replication_get_error,
    .stop = replication_stop,
};
82 
83 static int replication_open(BlockDriverState *bs, QDict *options,
84                             int flags, Error **errp)
85 {
86     int ret;
87     BDRVReplicationState *s = bs->opaque;
88     Error *local_err = NULL;
89     QemuOpts *opts = NULL;
90     const char *mode;
91     const char *top_id;
92 
93     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
94                                false, errp);
95     if (!bs->file) {
96         return -EINVAL;
97     }
98 
99     ret = -EINVAL;
100     opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
101     qemu_opts_absorb_qdict(opts, options, &local_err);
102     if (local_err) {
103         goto fail;
104     }
105 
106     mode = qemu_opt_get(opts, REPLICATION_MODE);
107     if (!mode) {
108         error_setg(&local_err, "Missing the option mode");
109         goto fail;
110     }
111 
112     if (!strcmp(mode, "primary")) {
113         s->mode = REPLICATION_MODE_PRIMARY;
114         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
115         if (top_id) {
116             error_setg(&local_err, "The primary side does not support option top-id");
117             goto fail;
118         }
119     } else if (!strcmp(mode, "secondary")) {
120         s->mode = REPLICATION_MODE_SECONDARY;
121         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
122         s->top_id = g_strdup(top_id);
123         if (!s->top_id) {
124             error_setg(&local_err, "Missing the option top-id");
125             goto fail;
126         }
127     } else {
128         error_setg(&local_err,
129                    "The option mode's value should be primary or secondary");
130         goto fail;
131     }
132 
133     s->rs = replication_new(bs, &replication_ops);
134 
135     ret = 0;
136 
137 fail:
138     qemu_opts_del(opts);
139     error_propagate(errp, local_err);
140 
141     return ret;
142 }
143 
144 static void replication_close(BlockDriverState *bs)
145 {
146     BDRVReplicationState *s = bs->opaque;
147 
148     if (s->stage == BLOCK_REPLICATION_RUNNING) {
149         replication_stop(s->rs, false, NULL);
150     }
151     if (s->stage == BLOCK_REPLICATION_FAILOVER) {
152         job_cancel_sync(&s->commit_job->job);
153     }
154 
155     if (s->mode == REPLICATION_MODE_SECONDARY) {
156         g_free(s->top_id);
157     }
158 
159     replication_remove(s->rs);
160 }
161 
162 static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
163                                    const BdrvChildRole *role,
164                                    BlockReopenQueue *reopen_queue,
165                                    uint64_t perm, uint64_t shared,
166                                    uint64_t *nperm, uint64_t *nshared)
167 {
168     *nperm = BLK_PERM_CONSISTENT_READ;
169     if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
170         *nperm |= BLK_PERM_WRITE;
171     }
172     *nshared = BLK_PERM_CONSISTENT_READ \
173                | BLK_PERM_WRITE \
174                | BLK_PERM_WRITE_UNCHANGED;
175     return;
176 }
177 
178 static int64_t replication_getlength(BlockDriverState *bs)
179 {
180     return bdrv_getlength(bs->file->bs);
181 }
182 
183 static int replication_get_io_status(BDRVReplicationState *s)
184 {
185     switch (s->stage) {
186     case BLOCK_REPLICATION_NONE:
187         return -EIO;
188     case BLOCK_REPLICATION_RUNNING:
189         return 0;
190     case BLOCK_REPLICATION_FAILOVER:
191         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
192     case BLOCK_REPLICATION_FAILOVER_FAILED:
193         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
194     case BLOCK_REPLICATION_DONE:
195         /*
196          * active commit job completes, and active disk and secondary_disk
197          * is swapped, so we can operate bs->file directly
198          */
199         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
200     default:
201         abort();
202     }
203 }
204 
205 static int replication_return_value(BDRVReplicationState *s, int ret)
206 {
207     if (s->mode == REPLICATION_MODE_SECONDARY) {
208         return ret;
209     }
210 
211     if (ret < 0) {
212         s->error = ret;
213         ret = 0;
214     }
215 
216     return ret;
217 }
218 
219 static coroutine_fn int replication_co_readv(BlockDriverState *bs,
220                                              int64_t sector_num,
221                                              int remaining_sectors,
222                                              QEMUIOVector *qiov)
223 {
224     BDRVReplicationState *s = bs->opaque;
225     int ret;
226 
227     if (s->mode == REPLICATION_MODE_PRIMARY) {
228         /* We only use it to forward primary write requests */
229         return -EIO;
230     }
231 
232     ret = replication_get_io_status(s);
233     if (ret < 0) {
234         return ret;
235     }
236 
237     ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
238                          remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
239 
240     return replication_return_value(s, ret);
241 }
242 
243 static coroutine_fn int replication_co_writev(BlockDriverState *bs,
244                                               int64_t sector_num,
245                                               int remaining_sectors,
246                                               QEMUIOVector *qiov,
247                                               int flags)
248 {
249     BDRVReplicationState *s = bs->opaque;
250     QEMUIOVector hd_qiov;
251     uint64_t bytes_done = 0;
252     BdrvChild *top = bs->file;
253     BdrvChild *base = s->secondary_disk;
254     BdrvChild *target;
255     int ret;
256     int64_t n;
257 
258     assert(!flags);
259     ret = replication_get_io_status(s);
260     if (ret < 0) {
261         goto out;
262     }
263 
264     if (ret == 0) {
265         ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
266                               remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
267         return replication_return_value(s, ret);
268     }
269 
270     /*
271      * Failover failed, only write to active disk if the sectors
272      * have already been allocated in active disk/hidden disk.
273      */
274     qemu_iovec_init(&hd_qiov, qiov->niov);
275     while (remaining_sectors > 0) {
276         int64_t count;
277 
278         ret = bdrv_is_allocated_above(top->bs, base->bs, false,
279                                       sector_num * BDRV_SECTOR_SIZE,
280                                       remaining_sectors * BDRV_SECTOR_SIZE,
281                                       &count);
282         if (ret < 0) {
283             goto out1;
284         }
285 
286         assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
287         n = count >> BDRV_SECTOR_BITS;
288         qemu_iovec_reset(&hd_qiov);
289         qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
290 
291         target = ret ? top : base;
292         ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
293                               n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
294         if (ret < 0) {
295             goto out1;
296         }
297 
298         remaining_sectors -= n;
299         sector_num += n;
300         bytes_done += count;
301     }
302 
303 out1:
304     qemu_iovec_destroy(&hd_qiov);
305 out:
306     return ret;
307 }
308 
309 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
310 {
311     Error *local_err = NULL;
312     int ret;
313 
314     if (!s->backup_job) {
315         error_setg(errp, "Backup job was cancelled unexpectedly");
316         return;
317     }
318 
319     backup_do_checkpoint(s->backup_job, &local_err);
320     if (local_err) {
321         error_propagate(errp, local_err);
322         return;
323     }
324 
325     if (!s->active_disk->bs->drv) {
326         error_setg(errp, "Active disk %s is ejected",
327                    s->active_disk->bs->node_name);
328         return;
329     }
330 
331     ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
332     if (ret < 0) {
333         error_setg(errp, "Cannot make active disk empty");
334         return;
335     }
336 
337     if (!s->hidden_disk->bs->drv) {
338         error_setg(errp, "Hidden disk %s is ejected",
339                    s->hidden_disk->bs->node_name);
340         return;
341     }
342 
343     ret = s->hidden_disk->bs->drv->bdrv_make_empty(s->hidden_disk->bs);
344     if (ret < 0) {
345         error_setg(errp, "Cannot make hidden disk empty");
346         return;
347     }
348 }
349 
350 /* This function is supposed to be called twice:
351  * first with writable = true, then with writable = false.
352  * The first call puts s->hidden_disk and s->secondary_disk in
353  * r/w mode, and the second puts them back in their original state.
354  */
355 static void reopen_backing_file(BlockDriverState *bs, bool writable,
356                                 Error **errp)
357 {
358     BDRVReplicationState *s = bs->opaque;
359     BlockReopenQueue *reopen_queue = NULL;
360     Error *local_err = NULL;
361 
362     if (writable) {
363         s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
364         s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
365     }
366 
367     bdrv_subtree_drained_begin(s->hidden_disk->bs);
368     bdrv_subtree_drained_begin(s->secondary_disk->bs);
369 
370     if (s->orig_hidden_read_only) {
371         QDict *opts = qdict_new();
372         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
373         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
374                                          opts, true);
375     }
376 
377     if (s->orig_secondary_read_only) {
378         QDict *opts = qdict_new();
379         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
380         reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
381                                          opts, true);
382     }
383 
384     if (reopen_queue) {
385         bdrv_reopen_multiple(reopen_queue, &local_err);
386         error_propagate(errp, local_err);
387     }
388 
389     bdrv_subtree_drained_end(s->hidden_disk->bs);
390     bdrv_subtree_drained_end(s->secondary_disk->bs);
391 }
392 
393 static void backup_job_cleanup(BlockDriverState *bs)
394 {
395     BDRVReplicationState *s = bs->opaque;
396     BlockDriverState *top_bs;
397 
398     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
399     if (!top_bs) {
400         return;
401     }
402     bdrv_op_unblock_all(top_bs, s->blocker);
403     error_free(s->blocker);
404     reopen_backing_file(bs, false, NULL);
405 }
406 
407 static void backup_job_completed(void *opaque, int ret)
408 {
409     BlockDriverState *bs = opaque;
410     BDRVReplicationState *s = bs->opaque;
411 
412     if (s->stage != BLOCK_REPLICATION_FAILOVER) {
413         /* The backup job is cancelled unexpectedly */
414         s->error = -EIO;
415     }
416 
417     backup_job_cleanup(bs);
418 }
419 
420 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
421 {
422     BdrvChild *child;
423 
424     /* The bs itself is the top_bs */
425     if (top_bs == bs) {
426         return true;
427     }
428 
429     /* Iterate over top_bs's children */
430     QLIST_FOREACH(child, &top_bs->children, next) {
431         if (child->bs == bs || check_top_bs(child->bs, bs)) {
432             return true;
433         }
434     }
435 
436     return false;
437 }
438 
439 static void replication_start(ReplicationState *rs, ReplicationMode mode,
440                               Error **errp)
441 {
442     BlockDriverState *bs = rs->opaque;
443     BDRVReplicationState *s;
444     BlockDriverState *top_bs;
445     int64_t active_length, hidden_length, disk_length;
446     AioContext *aio_context;
447     Error *local_err = NULL;
448 
449     aio_context = bdrv_get_aio_context(bs);
450     aio_context_acquire(aio_context);
451     s = bs->opaque;
452 
453     if (s->stage != BLOCK_REPLICATION_NONE) {
454         error_setg(errp, "Block replication is running or done");
455         aio_context_release(aio_context);
456         return;
457     }
458 
459     if (s->mode != mode) {
460         error_setg(errp, "The parameter mode's value is invalid, needs %d,"
461                    " but got %d", s->mode, mode);
462         aio_context_release(aio_context);
463         return;
464     }
465 
466     switch (s->mode) {
467     case REPLICATION_MODE_PRIMARY:
468         break;
469     case REPLICATION_MODE_SECONDARY:
470         s->active_disk = bs->file;
471         if (!s->active_disk || !s->active_disk->bs ||
472                                     !s->active_disk->bs->backing) {
473             error_setg(errp, "Active disk doesn't have backing file");
474             aio_context_release(aio_context);
475             return;
476         }
477 
478         s->hidden_disk = s->active_disk->bs->backing;
479         if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
480             error_setg(errp, "Hidden disk doesn't have backing file");
481             aio_context_release(aio_context);
482             return;
483         }
484 
485         s->secondary_disk = s->hidden_disk->bs->backing;
486         if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
487             error_setg(errp, "The secondary disk doesn't have block backend");
488             aio_context_release(aio_context);
489             return;
490         }
491 
492         /* verify the length */
493         active_length = bdrv_getlength(s->active_disk->bs);
494         hidden_length = bdrv_getlength(s->hidden_disk->bs);
495         disk_length = bdrv_getlength(s->secondary_disk->bs);
496         if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
497             active_length != hidden_length || hidden_length != disk_length) {
498             error_setg(errp, "Active disk, hidden disk, secondary disk's length"
499                        " are not the same");
500             aio_context_release(aio_context);
501             return;
502         }
503 
504         /* Must be true, or the bdrv_getlength() calls would have failed */
505         assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
506 
507         if (!s->active_disk->bs->drv->bdrv_make_empty ||
508             !s->hidden_disk->bs->drv->bdrv_make_empty) {
509             error_setg(errp,
510                        "Active disk or hidden disk doesn't support make_empty");
511             aio_context_release(aio_context);
512             return;
513         }
514 
515         /* reopen the backing file in r/w mode */
516         reopen_backing_file(bs, true, &local_err);
517         if (local_err) {
518             error_propagate(errp, local_err);
519             aio_context_release(aio_context);
520             return;
521         }
522 
523         /* start backup job now */
524         error_setg(&s->blocker,
525                    "Block device is in use by internal backup job");
526 
527         top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
528         if (!top_bs || !bdrv_is_root_node(top_bs) ||
529             !check_top_bs(top_bs, bs)) {
530             error_setg(errp, "No top_bs or it is invalid");
531             reopen_backing_file(bs, false, NULL);
532             aio_context_release(aio_context);
533             return;
534         }
535         bdrv_op_block_all(top_bs, s->blocker);
536         bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
537 
538         s->backup_job = backup_job_create(
539                                 NULL, s->secondary_disk->bs, s->hidden_disk->bs,
540                                 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
541                                 BLOCKDEV_ON_ERROR_REPORT,
542                                 BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
543                                 backup_job_completed, bs, NULL, &local_err);
544         if (local_err) {
545             error_propagate(errp, local_err);
546             backup_job_cleanup(bs);
547             aio_context_release(aio_context);
548             return;
549         }
550         job_start(&s->backup_job->job);
551         break;
552     default:
553         aio_context_release(aio_context);
554         abort();
555     }
556 
557     s->stage = BLOCK_REPLICATION_RUNNING;
558 
559     if (s->mode == REPLICATION_MODE_SECONDARY) {
560         secondary_do_checkpoint(s, errp);
561     }
562 
563     s->error = 0;
564     aio_context_release(aio_context);
565 }
566 
567 static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
568 {
569     BlockDriverState *bs = rs->opaque;
570     BDRVReplicationState *s;
571     AioContext *aio_context;
572 
573     aio_context = bdrv_get_aio_context(bs);
574     aio_context_acquire(aio_context);
575     s = bs->opaque;
576 
577     if (s->mode == REPLICATION_MODE_SECONDARY) {
578         secondary_do_checkpoint(s, errp);
579     }
580     aio_context_release(aio_context);
581 }
582 
583 static void replication_get_error(ReplicationState *rs, Error **errp)
584 {
585     BlockDriverState *bs = rs->opaque;
586     BDRVReplicationState *s;
587     AioContext *aio_context;
588 
589     aio_context = bdrv_get_aio_context(bs);
590     aio_context_acquire(aio_context);
591     s = bs->opaque;
592 
593     if (s->stage != BLOCK_REPLICATION_RUNNING) {
594         error_setg(errp, "Block replication is not running");
595         aio_context_release(aio_context);
596         return;
597     }
598 
599     if (s->error) {
600         error_setg(errp, "I/O error occurred");
601         aio_context_release(aio_context);
602         return;
603     }
604     aio_context_release(aio_context);
605 }
606 
607 static void replication_done(void *opaque, int ret)
608 {
609     BlockDriverState *bs = opaque;
610     BDRVReplicationState *s = bs->opaque;
611 
612     if (ret == 0) {
613         s->stage = BLOCK_REPLICATION_DONE;
614 
615         s->active_disk = NULL;
616         s->secondary_disk = NULL;
617         s->hidden_disk = NULL;
618         s->error = 0;
619     } else {
620         s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
621         s->error = -EIO;
622     }
623 }
624 
625 static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
626 {
627     BlockDriverState *bs = rs->opaque;
628     BDRVReplicationState *s;
629     AioContext *aio_context;
630 
631     aio_context = bdrv_get_aio_context(bs);
632     aio_context_acquire(aio_context);
633     s = bs->opaque;
634 
635     if (s->stage != BLOCK_REPLICATION_RUNNING) {
636         error_setg(errp, "Block replication is not running");
637         aio_context_release(aio_context);
638         return;
639     }
640 
641     switch (s->mode) {
642     case REPLICATION_MODE_PRIMARY:
643         s->stage = BLOCK_REPLICATION_DONE;
644         s->error = 0;
645         break;
646     case REPLICATION_MODE_SECONDARY:
647         /*
648          * This BDS will be closed, and the job should be completed
649          * before the BDS is closed, because we will access hidden
650          * disk, secondary disk in backup_job_completed().
651          */
652         if (s->backup_job) {
653             job_cancel_sync(&s->backup_job->job);
654         }
655 
656         if (!failover) {
657             secondary_do_checkpoint(s, errp);
658             s->stage = BLOCK_REPLICATION_DONE;
659             aio_context_release(aio_context);
660             return;
661         }
662 
663         s->stage = BLOCK_REPLICATION_FAILOVER;
664         s->commit_job = commit_active_start(
665                             NULL, s->active_disk->bs, s->secondary_disk->bs,
666                             JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
667                             NULL, replication_done, bs, true, errp);
668         break;
669     default:
670         aio_context_release(aio_context);
671         abort();
672     }
673     aio_context_release(aio_context);
674 }
675 
/*
 * NULL-terminated list of option names exported through
 * BlockDriver.strong_runtime_opts: both driver options, "mode" and
 * "top-id".
 */
static const char *const replication_strong_runtime_opts[] = {
    REPLICATION_MODE,
    REPLICATION_TOP_ID,

    NULL
};
682 
/*
 * Block driver definition for the "replication" filter. Wires the
 * open/close, permission, length and sector-based read/write callbacks
 * implemented in this file.
 */
static BlockDriver bdrv_replication = {
    .format_name                = "replication",
    .instance_size              = sizeof(BDRVReplicationState),

    .bdrv_open                  = replication_open,
    .bdrv_close                 = replication_close,
    .bdrv_child_perm            = replication_child_perm,

    .bdrv_getlength             = replication_getlength,
    .bdrv_co_readv              = replication_co_readv,
    .bdrv_co_writev             = replication_co_writev,

    .is_filter                  = true,

    /* length is always taken from the file child, see replication_getlength() */
    .has_variable_length        = true,
    .strong_runtime_opts        = replication_strong_runtime_opts,
};
700 
/* Register the replication block driver with the block layer. */
static void bdrv_replication_init(void)
{
    bdrv_register(&bdrv_replication);
}

/* Hook the registration function into the block-module init sequence */
block_init(bdrv_replication_init);
707