xref: /openbmc/qemu/block/replication.c (revision 500eb6db)
1 /*
2  * Replication Block filter
3  *
4  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
5  * Copyright (c) 2016 Intel Corporation
6  * Copyright (c) 2016 FUJITSU LIMITED
7  *
8  * Author:
9  *   Wen Congyang <wency@cn.fujitsu.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qemu/module.h"
17 #include "qemu/option.h"
18 #include "block/nbd.h"
19 #include "block/blockjob.h"
20 #include "block/block_int.h"
21 #include "block/block_backup.h"
22 #include "sysemu/block-backend.h"
23 #include "qapi/error.h"
24 #include "qapi/qmp/qdict.h"
25 #include "replication.h"
26 
/*
 * Life-cycle stage of one replication filter node, kept in
 * BDRVReplicationState.stage.  replication_get_io_status() maps each stage
 * to the I/O policy applied to guest requests.
 */
typedef enum {
    BLOCK_REPLICATION_NONE,             /* block replication is not started */
    BLOCK_REPLICATION_RUNNING,          /* block replication is running */
    BLOCK_REPLICATION_FAILOVER,         /* failover is running in background */
    BLOCK_REPLICATION_FAILOVER_FAILED,  /* failover failed */
    BLOCK_REPLICATION_DONE,             /* block replication is done */
} ReplicationStage;
34 
/* Per-node driver state of the replication filter (bs->opaque). */
typedef struct BDRVReplicationState {
    /* primary or secondary; parsed from the "mode" option at open time */
    ReplicationMode mode;
    /* current life-cycle stage, see ReplicationStage */
    ReplicationStage stage;
    /* secondary only: bs->file, assigned by replication_start() */
    BdrvChild *active_disk;
    /* secondary only: backing child of the active disk */
    BdrvChild *hidden_disk;
    /* secondary only: backing child of the hidden disk */
    BdrvChild *secondary_disk;
    /* secondary only: copy of the "top-id" option, freed on close */
    char *top_id;
    /* handle returned by replication_new() for this node */
    ReplicationState *rs;
    /* op blocker reason installed on the top node by replication_start() */
    Error *blocker;
    /* read-only state recorded by reopen_backing_file(writable = true) */
    bool orig_hidden_read_only;
    bool orig_secondary_read_only;
    /* 0 or a negative errno recorded on failure; see replication_get_error() */
    int error;
} BDRVReplicationState;
48 
/*
 * Forward declarations of the ReplicationOps callbacks; they are referenced
 * by the replication_ops table before their definitions appear.
 */
static void replication_start(ReplicationState *rs, ReplicationMode mode,
                              Error **errp);
static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
static void replication_get_error(ReplicationState *rs, Error **errp);
static void replication_stop(ReplicationState *rs, bool failover,
                             Error **errp);
55 
/* Names of the runtime options understood by replication_open(). */
#define REPLICATION_MODE        "mode"
#define REPLICATION_TOP_ID      "top-id"
/* Option schema used by replication_open() to absorb the options QDict. */
static QemuOptsList replication_runtime_opts = {
    .name = "replication",
    .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
    .desc = {
        {
            /* "primary" or "secondary" */
            .name = REPLICATION_MODE,
            .type = QEMU_OPT_STRING,
        },
        {
            /* required for "secondary", rejected for "primary" */
            .name = REPLICATION_TOP_ID,
            .type = QEMU_OPT_STRING,
        },
        { /* end of list */ }
    },
};
73 
/* Callback table handed to replication_new() by replication_open(). */
static ReplicationOps replication_ops = {
    .start = replication_start,
    .checkpoint = replication_do_checkpoint,
    .get_error = replication_get_error,
    .stop = replication_stop,
};
80 
81 static int replication_open(BlockDriverState *bs, QDict *options,
82                             int flags, Error **errp)
83 {
84     int ret;
85     BDRVReplicationState *s = bs->opaque;
86     Error *local_err = NULL;
87     QemuOpts *opts = NULL;
88     const char *mode;
89     const char *top_id;
90 
91     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
92                                false, errp);
93     if (!bs->file) {
94         return -EINVAL;
95     }
96 
97     ret = -EINVAL;
98     opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
99     qemu_opts_absorb_qdict(opts, options, &local_err);
100     if (local_err) {
101         goto fail;
102     }
103 
104     mode = qemu_opt_get(opts, REPLICATION_MODE);
105     if (!mode) {
106         error_setg(&local_err, "Missing the option mode");
107         goto fail;
108     }
109 
110     if (!strcmp(mode, "primary")) {
111         s->mode = REPLICATION_MODE_PRIMARY;
112         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
113         if (top_id) {
114             error_setg(&local_err, "The primary side does not support option top-id");
115             goto fail;
116         }
117     } else if (!strcmp(mode, "secondary")) {
118         s->mode = REPLICATION_MODE_SECONDARY;
119         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
120         s->top_id = g_strdup(top_id);
121         if (!s->top_id) {
122             error_setg(&local_err, "Missing the option top-id");
123             goto fail;
124         }
125     } else {
126         error_setg(&local_err,
127                    "The option mode's value should be primary or secondary");
128         goto fail;
129     }
130 
131     s->rs = replication_new(bs, &replication_ops);
132 
133     ret = 0;
134 
135 fail:
136     qemu_opts_del(opts);
137     error_propagate(errp, local_err);
138 
139     return ret;
140 }
141 
142 static void replication_close(BlockDriverState *bs)
143 {
144     BDRVReplicationState *s = bs->opaque;
145 
146     if (s->stage == BLOCK_REPLICATION_RUNNING) {
147         replication_stop(s->rs, false, NULL);
148     }
149     if (s->stage == BLOCK_REPLICATION_FAILOVER) {
150         job_cancel_sync(&s->active_disk->bs->job->job);
151     }
152 
153     if (s->mode == REPLICATION_MODE_SECONDARY) {
154         g_free(s->top_id);
155     }
156 
157     replication_remove(s->rs);
158 }
159 
160 static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
161                                    const BdrvChildRole *role,
162                                    BlockReopenQueue *reopen_queue,
163                                    uint64_t perm, uint64_t shared,
164                                    uint64_t *nperm, uint64_t *nshared)
165 {
166     *nperm = BLK_PERM_CONSISTENT_READ;
167     if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
168         *nperm |= BLK_PERM_WRITE;
169     }
170     *nshared = BLK_PERM_CONSISTENT_READ \
171                | BLK_PERM_WRITE \
172                | BLK_PERM_WRITE_UNCHANGED;
173     return;
174 }
175 
176 static int64_t replication_getlength(BlockDriverState *bs)
177 {
178     return bdrv_getlength(bs->file->bs);
179 }
180 
181 static int replication_get_io_status(BDRVReplicationState *s)
182 {
183     switch (s->stage) {
184     case BLOCK_REPLICATION_NONE:
185         return -EIO;
186     case BLOCK_REPLICATION_RUNNING:
187         return 0;
188     case BLOCK_REPLICATION_FAILOVER:
189         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
190     case BLOCK_REPLICATION_FAILOVER_FAILED:
191         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
192     case BLOCK_REPLICATION_DONE:
193         /*
194          * active commit job completes, and active disk and secondary_disk
195          * is swapped, so we can operate bs->file directly
196          */
197         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
198     default:
199         abort();
200     }
201 }
202 
203 static int replication_return_value(BDRVReplicationState *s, int ret)
204 {
205     if (s->mode == REPLICATION_MODE_SECONDARY) {
206         return ret;
207     }
208 
209     if (ret < 0) {
210         s->error = ret;
211         ret = 0;
212     }
213 
214     return ret;
215 }
216 
217 static coroutine_fn int replication_co_readv(BlockDriverState *bs,
218                                              int64_t sector_num,
219                                              int remaining_sectors,
220                                              QEMUIOVector *qiov)
221 {
222     BDRVReplicationState *s = bs->opaque;
223     int ret;
224 
225     if (s->mode == REPLICATION_MODE_PRIMARY) {
226         /* We only use it to forward primary write requests */
227         return -EIO;
228     }
229 
230     ret = replication_get_io_status(s);
231     if (ret < 0) {
232         return ret;
233     }
234 
235     ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
236                          remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
237 
238     return replication_return_value(s, ret);
239 }
240 
241 static coroutine_fn int replication_co_writev(BlockDriverState *bs,
242                                               int64_t sector_num,
243                                               int remaining_sectors,
244                                               QEMUIOVector *qiov,
245                                               int flags)
246 {
247     BDRVReplicationState *s = bs->opaque;
248     QEMUIOVector hd_qiov;
249     uint64_t bytes_done = 0;
250     BdrvChild *top = bs->file;
251     BdrvChild *base = s->secondary_disk;
252     BdrvChild *target;
253     int ret;
254     int64_t n;
255 
256     assert(!flags);
257     ret = replication_get_io_status(s);
258     if (ret < 0) {
259         goto out;
260     }
261 
262     if (ret == 0) {
263         ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
264                               remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
265         return replication_return_value(s, ret);
266     }
267 
268     /*
269      * Failover failed, only write to active disk if the sectors
270      * have already been allocated in active disk/hidden disk.
271      */
272     qemu_iovec_init(&hd_qiov, qiov->niov);
273     while (remaining_sectors > 0) {
274         int64_t count;
275 
276         ret = bdrv_is_allocated_above(top->bs, base->bs,
277                                       sector_num * BDRV_SECTOR_SIZE,
278                                       remaining_sectors * BDRV_SECTOR_SIZE,
279                                       &count);
280         if (ret < 0) {
281             goto out1;
282         }
283 
284         assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
285         n = count >> BDRV_SECTOR_BITS;
286         qemu_iovec_reset(&hd_qiov);
287         qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
288 
289         target = ret ? top : base;
290         ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
291                               n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
292         if (ret < 0) {
293             goto out1;
294         }
295 
296         remaining_sectors -= n;
297         sector_num += n;
298         bytes_done += count;
299     }
300 
301 out1:
302     qemu_iovec_destroy(&hd_qiov);
303 out:
304     return ret;
305 }
306 
307 static bool replication_recurse_is_first_non_filter(BlockDriverState *bs,
308                                                     BlockDriverState *candidate)
309 {
310     return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
311 }
312 
313 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
314 {
315     Error *local_err = NULL;
316     int ret;
317 
318     if (!s->secondary_disk->bs->job) {
319         error_setg(errp, "Backup job was cancelled unexpectedly");
320         return;
321     }
322 
323     backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
324     if (local_err) {
325         error_propagate(errp, local_err);
326         return;
327     }
328 
329     if (!s->active_disk->bs->drv) {
330         error_setg(errp, "Active disk %s is ejected",
331                    s->active_disk->bs->node_name);
332         return;
333     }
334 
335     ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
336     if (ret < 0) {
337         error_setg(errp, "Cannot make active disk empty");
338         return;
339     }
340 
341     if (!s->hidden_disk->bs->drv) {
342         error_setg(errp, "Hidden disk %s is ejected",
343                    s->hidden_disk->bs->node_name);
344         return;
345     }
346 
347     ret = s->hidden_disk->bs->drv->bdrv_make_empty(s->hidden_disk->bs);
348     if (ret < 0) {
349         error_setg(errp, "Cannot make hidden disk empty");
350         return;
351     }
352 }
353 
354 /* This function is supposed to be called twice:
355  * first with writable = true, then with writable = false.
356  * The first call puts s->hidden_disk and s->secondary_disk in
357  * r/w mode, and the second puts them back in their original state.
358  */
359 static void reopen_backing_file(BlockDriverState *bs, bool writable,
360                                 Error **errp)
361 {
362     BDRVReplicationState *s = bs->opaque;
363     BlockReopenQueue *reopen_queue = NULL;
364     Error *local_err = NULL;
365 
366     if (writable) {
367         s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
368         s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
369     }
370 
371     bdrv_subtree_drained_begin(s->hidden_disk->bs);
372     bdrv_subtree_drained_begin(s->secondary_disk->bs);
373 
374     if (s->orig_hidden_read_only) {
375         QDict *opts = qdict_new();
376         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
377         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
378                                          opts, true);
379     }
380 
381     if (s->orig_secondary_read_only) {
382         QDict *opts = qdict_new();
383         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
384         reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
385                                          opts, true);
386     }
387 
388     if (reopen_queue) {
389         bdrv_reopen_multiple(reopen_queue, &local_err);
390         error_propagate(errp, local_err);
391     }
392 
393     bdrv_subtree_drained_end(s->hidden_disk->bs);
394     bdrv_subtree_drained_end(s->secondary_disk->bs);
395 }
396 
397 static void backup_job_cleanup(BlockDriverState *bs)
398 {
399     BDRVReplicationState *s = bs->opaque;
400     BlockDriverState *top_bs;
401 
402     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
403     if (!top_bs) {
404         return;
405     }
406     bdrv_op_unblock_all(top_bs, s->blocker);
407     error_free(s->blocker);
408     reopen_backing_file(bs, false, NULL);
409 }
410 
411 static void backup_job_completed(void *opaque, int ret)
412 {
413     BlockDriverState *bs = opaque;
414     BDRVReplicationState *s = bs->opaque;
415 
416     if (s->stage != BLOCK_REPLICATION_FAILOVER) {
417         /* The backup job is cancelled unexpectedly */
418         s->error = -EIO;
419     }
420 
421     backup_job_cleanup(bs);
422 }
423 
424 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
425 {
426     BdrvChild *child;
427 
428     /* The bs itself is the top_bs */
429     if (top_bs == bs) {
430         return true;
431     }
432 
433     /* Iterate over top_bs's children */
434     QLIST_FOREACH(child, &top_bs->children, next) {
435         if (child->bs == bs || check_top_bs(child->bs, bs)) {
436             return true;
437         }
438     }
439 
440     return false;
441 }
442 
443 static void replication_start(ReplicationState *rs, ReplicationMode mode,
444                               Error **errp)
445 {
446     BlockDriverState *bs = rs->opaque;
447     BDRVReplicationState *s;
448     BlockDriverState *top_bs;
449     int64_t active_length, hidden_length, disk_length;
450     AioContext *aio_context;
451     Error *local_err = NULL;
452     BlockJob *job;
453 
454     aio_context = bdrv_get_aio_context(bs);
455     aio_context_acquire(aio_context);
456     s = bs->opaque;
457 
458     if (s->stage != BLOCK_REPLICATION_NONE) {
459         error_setg(errp, "Block replication is running or done");
460         aio_context_release(aio_context);
461         return;
462     }
463 
464     if (s->mode != mode) {
465         error_setg(errp, "The parameter mode's value is invalid, needs %d,"
466                    " but got %d", s->mode, mode);
467         aio_context_release(aio_context);
468         return;
469     }
470 
471     switch (s->mode) {
472     case REPLICATION_MODE_PRIMARY:
473         break;
474     case REPLICATION_MODE_SECONDARY:
475         s->active_disk = bs->file;
476         if (!s->active_disk || !s->active_disk->bs ||
477                                     !s->active_disk->bs->backing) {
478             error_setg(errp, "Active disk doesn't have backing file");
479             aio_context_release(aio_context);
480             return;
481         }
482 
483         s->hidden_disk = s->active_disk->bs->backing;
484         if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
485             error_setg(errp, "Hidden disk doesn't have backing file");
486             aio_context_release(aio_context);
487             return;
488         }
489 
490         s->secondary_disk = s->hidden_disk->bs->backing;
491         if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
492             error_setg(errp, "The secondary disk doesn't have block backend");
493             aio_context_release(aio_context);
494             return;
495         }
496 
497         /* verify the length */
498         active_length = bdrv_getlength(s->active_disk->bs);
499         hidden_length = bdrv_getlength(s->hidden_disk->bs);
500         disk_length = bdrv_getlength(s->secondary_disk->bs);
501         if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
502             active_length != hidden_length || hidden_length != disk_length) {
503             error_setg(errp, "Active disk, hidden disk, secondary disk's length"
504                        " are not the same");
505             aio_context_release(aio_context);
506             return;
507         }
508 
509         /* Must be true, or the bdrv_getlength() calls would have failed */
510         assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
511 
512         if (!s->active_disk->bs->drv->bdrv_make_empty ||
513             !s->hidden_disk->bs->drv->bdrv_make_empty) {
514             error_setg(errp,
515                        "Active disk or hidden disk doesn't support make_empty");
516             aio_context_release(aio_context);
517             return;
518         }
519 
520         /* reopen the backing file in r/w mode */
521         reopen_backing_file(bs, true, &local_err);
522         if (local_err) {
523             error_propagate(errp, local_err);
524             aio_context_release(aio_context);
525             return;
526         }
527 
528         /* start backup job now */
529         error_setg(&s->blocker,
530                    "Block device is in use by internal backup job");
531 
532         top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
533         if (!top_bs || !bdrv_is_root_node(top_bs) ||
534             !check_top_bs(top_bs, bs)) {
535             error_setg(errp, "No top_bs or it is invalid");
536             reopen_backing_file(bs, false, NULL);
537             aio_context_release(aio_context);
538             return;
539         }
540         bdrv_op_block_all(top_bs, s->blocker);
541         bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
542 
543         job = backup_job_create(NULL, s->secondary_disk->bs, s->hidden_disk->bs,
544                                 0, MIRROR_SYNC_MODE_NONE, NULL, false,
545                                 BLOCKDEV_ON_ERROR_REPORT,
546                                 BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
547                                 backup_job_completed, bs, NULL, &local_err);
548         if (local_err) {
549             error_propagate(errp, local_err);
550             backup_job_cleanup(bs);
551             aio_context_release(aio_context);
552             return;
553         }
554         job_start(&job->job);
555         break;
556     default:
557         aio_context_release(aio_context);
558         abort();
559     }
560 
561     s->stage = BLOCK_REPLICATION_RUNNING;
562 
563     if (s->mode == REPLICATION_MODE_SECONDARY) {
564         secondary_do_checkpoint(s, errp);
565     }
566 
567     s->error = 0;
568     aio_context_release(aio_context);
569 }
570 
571 static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
572 {
573     BlockDriverState *bs = rs->opaque;
574     BDRVReplicationState *s;
575     AioContext *aio_context;
576 
577     aio_context = bdrv_get_aio_context(bs);
578     aio_context_acquire(aio_context);
579     s = bs->opaque;
580 
581     if (s->mode == REPLICATION_MODE_SECONDARY) {
582         secondary_do_checkpoint(s, errp);
583     }
584     aio_context_release(aio_context);
585 }
586 
587 static void replication_get_error(ReplicationState *rs, Error **errp)
588 {
589     BlockDriverState *bs = rs->opaque;
590     BDRVReplicationState *s;
591     AioContext *aio_context;
592 
593     aio_context = bdrv_get_aio_context(bs);
594     aio_context_acquire(aio_context);
595     s = bs->opaque;
596 
597     if (s->stage != BLOCK_REPLICATION_RUNNING) {
598         error_setg(errp, "Block replication is not running");
599         aio_context_release(aio_context);
600         return;
601     }
602 
603     if (s->error) {
604         error_setg(errp, "I/O error occurred");
605         aio_context_release(aio_context);
606         return;
607     }
608     aio_context_release(aio_context);
609 }
610 
611 static void replication_done(void *opaque, int ret)
612 {
613     BlockDriverState *bs = opaque;
614     BDRVReplicationState *s = bs->opaque;
615 
616     if (ret == 0) {
617         s->stage = BLOCK_REPLICATION_DONE;
618 
619         s->active_disk = NULL;
620         s->secondary_disk = NULL;
621         s->hidden_disk = NULL;
622         s->error = 0;
623     } else {
624         s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
625         s->error = -EIO;
626     }
627 }
628 
629 static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
630 {
631     BlockDriverState *bs = rs->opaque;
632     BDRVReplicationState *s;
633     AioContext *aio_context;
634 
635     aio_context = bdrv_get_aio_context(bs);
636     aio_context_acquire(aio_context);
637     s = bs->opaque;
638 
639     if (s->stage != BLOCK_REPLICATION_RUNNING) {
640         error_setg(errp, "Block replication is not running");
641         aio_context_release(aio_context);
642         return;
643     }
644 
645     switch (s->mode) {
646     case REPLICATION_MODE_PRIMARY:
647         s->stage = BLOCK_REPLICATION_DONE;
648         s->error = 0;
649         break;
650     case REPLICATION_MODE_SECONDARY:
651         /*
652          * This BDS will be closed, and the job should be completed
653          * before the BDS is closed, because we will access hidden
654          * disk, secondary disk in backup_job_completed().
655          */
656         if (s->secondary_disk->bs->job) {
657             job_cancel_sync(&s->secondary_disk->bs->job->job);
658         }
659 
660         if (!failover) {
661             secondary_do_checkpoint(s, errp);
662             s->stage = BLOCK_REPLICATION_DONE;
663             aio_context_release(aio_context);
664             return;
665         }
666 
667         s->stage = BLOCK_REPLICATION_FAILOVER;
668         commit_active_start(NULL, s->active_disk->bs, s->secondary_disk->bs,
669                             JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
670                             NULL, replication_done, bs, true, errp);
671         break;
672     default:
673         aio_context_release(aio_context);
674         abort();
675     }
676     aio_context_release(aio_context);
677 }
678 
/*
 * NULL-terminated list of option names exported through
 * BlockDriver.strong_runtime_opts; it names both runtime options of this
 * driver.
 */
static const char *const replication_strong_runtime_opts[] = {
    REPLICATION_MODE,
    REPLICATION_TOP_ID,

    NULL
};
685 
/* Block driver definition of the "replication" filter. */
static BlockDriver bdrv_replication = {
    .format_name                = "replication",
    .instance_size              = sizeof(BDRVReplicationState),

    /* node life cycle and permissions */
    .bdrv_open                  = replication_open,
    .bdrv_close                 = replication_close,
    .bdrv_child_perm            = replication_child_perm,

    /* length query and sector-based I/O entry points */
    .bdrv_getlength             = replication_getlength,
    .bdrv_co_readv              = replication_co_readv,
    .bdrv_co_writev             = replication_co_writev,

    .is_filter                  = true,
    .bdrv_recurse_is_first_non_filter = replication_recurse_is_first_non_filter,

    .has_variable_length        = true,
    .strong_runtime_opts        = replication_strong_runtime_opts,
};
704 
/* Register the replication driver with the block layer. */
static void bdrv_replication_init(void)
{
    bdrv_register(&bdrv_replication);
}

/* Hook bdrv_replication_init() into the block module initialisation. */
block_init(bdrv_replication_init);
711