xref: /openbmc/qemu/block/replication.c (revision dd8d6a2d)
1 /*
2  * Replication Block filter
3  *
4  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
5  * Copyright (c) 2016 Intel Corporation
6  * Copyright (c) 2016 FUJITSU LIMITED
7  *
8  * Author:
9  *   Wen Congyang <wency@cn.fujitsu.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qemu/option.h"
17 #include "block/nbd.h"
18 #include "block/blockjob.h"
19 #include "block/block_int.h"
20 #include "block/block_backup.h"
21 #include "sysemu/block-backend.h"
22 #include "qapi/error.h"
23 #include "qapi/qmp/qdict.h"
24 #include "replication.h"
25 
/*
 * Stage of a replication filter node, kept in BDRVReplicationState.stage.
 * replication_get_io_status() combines the stage with the node's mode to
 * decide how guest I/O is handled.
 */
26 typedef enum {
27     BLOCK_REPLICATION_NONE,             /* block replication is not started */
28     BLOCK_REPLICATION_RUNNING,          /* block replication is running */
29     BLOCK_REPLICATION_FAILOVER,         /* failover is running in background */
30     BLOCK_REPLICATION_FAILOVER_FAILED,  /* failover failed */
31     BLOCK_REPLICATION_DONE,             /* block replication is done */
32 } ReplicationStage;
33 
/* Driver-private state of a replication node (accessed as bs->opaque). */
34 typedef struct BDRVReplicationState {
35     ReplicationMode mode;           /* from the "mode" option, set at open */
36     ReplicationStage stage;         /* current stage, see ReplicationStage */
37     BdrvChild *active_disk;         /* bs->file, set by replication_start() */
38     BdrvChild *hidden_disk;         /* backing child of the active disk */
39     BdrvChild *secondary_disk;      /* backing child of the hidden disk */
40     char *top_id;                   /* copy of "top-id" (secondary only) */
41     ReplicationState *rs;           /* handle returned by replication_new() */
42     Error *blocker;                 /* op blocker placed on the top_id node */
43     bool orig_hidden_read_only;     /* hidden disk was r/o before reopen */
44     bool orig_secondary_read_only;  /* secondary disk was r/o before reopen */
45     int error;                      /* recorded I/O or backup job failure */
46 } BDRVReplicationState;
47 
/*
 * Forward declarations of the ReplicationOps callbacks; they are referenced
 * by replication_ops and replication_close() before their definitions.
 */
48 static void replication_start(ReplicationState *rs, ReplicationMode mode,
49                               Error **errp);
50 static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
51 static void replication_get_error(ReplicationState *rs, Error **errp);
52 static void replication_stop(ReplicationState *rs, bool failover,
53                              Error **errp);
54 
/*
 * Names of the runtime options understood by replication_open():
 * "mode" must be "primary" or "secondary"; "top-id" is required in
 * secondary mode and rejected in primary mode.  Both are plain strings.
 */
55 #define REPLICATION_MODE        "mode"
56 #define REPLICATION_TOP_ID      "top-id"
57 static QemuOptsList replication_runtime_opts = {
58     .name = "replication",
59     .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
60     .desc = {
61         {
62             .name = REPLICATION_MODE,
63             .type = QEMU_OPT_STRING,
64         },
65         {
66             .name = REPLICATION_TOP_ID,
67             .type = QEMU_OPT_STRING,
68         },
69         { /* end of list */ }
70     },
71 };
72 
/* Callback table passed to replication_new() by replication_open(). */
73 static ReplicationOps replication_ops = {
74     .start = replication_start,
75     .checkpoint = replication_do_checkpoint,
76     .get_error = replication_get_error,
77     .stop = replication_stop,
78 };
79 
80 static int replication_open(BlockDriverState *bs, QDict *options,
81                             int flags, Error **errp)
82 {
83     int ret;
84     BDRVReplicationState *s = bs->opaque;
85     Error *local_err = NULL;
86     QemuOpts *opts = NULL;
87     const char *mode;
88     const char *top_id;
89 
90     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
91                                false, errp);
92     if (!bs->file) {
93         return -EINVAL;
94     }
95 
96     ret = -EINVAL;
97     opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
98     qemu_opts_absorb_qdict(opts, options, &local_err);
99     if (local_err) {
100         goto fail;
101     }
102 
103     mode = qemu_opt_get(opts, REPLICATION_MODE);
104     if (!mode) {
105         error_setg(&local_err, "Missing the option mode");
106         goto fail;
107     }
108 
109     if (!strcmp(mode, "primary")) {
110         s->mode = REPLICATION_MODE_PRIMARY;
111         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
112         if (top_id) {
113             error_setg(&local_err, "The primary side does not support option top-id");
114             goto fail;
115         }
116     } else if (!strcmp(mode, "secondary")) {
117         s->mode = REPLICATION_MODE_SECONDARY;
118         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
119         s->top_id = g_strdup(top_id);
120         if (!s->top_id) {
121             error_setg(&local_err, "Missing the option top-id");
122             goto fail;
123         }
124     } else {
125         error_setg(&local_err,
126                    "The option mode's value should be primary or secondary");
127         goto fail;
128     }
129 
130     s->rs = replication_new(bs, &replication_ops);
131 
132     ret = 0;
133 
134 fail:
135     qemu_opts_del(opts);
136     error_propagate(errp, local_err);
137 
138     return ret;
139 }
140 
141 static void replication_close(BlockDriverState *bs)
142 {
143     BDRVReplicationState *s = bs->opaque;
144 
145     if (s->stage == BLOCK_REPLICATION_RUNNING) {
146         replication_stop(s->rs, false, NULL);
147     }
148     if (s->stage == BLOCK_REPLICATION_FAILOVER) {
149         job_cancel_sync(&s->active_disk->bs->job->job);
150     }
151 
152     if (s->mode == REPLICATION_MODE_SECONDARY) {
153         g_free(s->top_id);
154     }
155 
156     replication_remove(s->rs);
157 }
158 
159 static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
160                                    const BdrvChildRole *role,
161                                    BlockReopenQueue *reopen_queue,
162                                    uint64_t perm, uint64_t shared,
163                                    uint64_t *nperm, uint64_t *nshared)
164 {
165     *nperm = BLK_PERM_CONSISTENT_READ;
166     if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
167         *nperm |= BLK_PERM_WRITE;
168     }
169     *nshared = BLK_PERM_CONSISTENT_READ \
170                | BLK_PERM_WRITE \
171                | BLK_PERM_WRITE_UNCHANGED;
172     return;
173 }
174 
175 static int64_t replication_getlength(BlockDriverState *bs)
176 {
177     return bdrv_getlength(bs->file->bs);
178 }
179 
180 static int replication_get_io_status(BDRVReplicationState *s)
181 {
182     switch (s->stage) {
183     case BLOCK_REPLICATION_NONE:
184         return -EIO;
185     case BLOCK_REPLICATION_RUNNING:
186         return 0;
187     case BLOCK_REPLICATION_FAILOVER:
188         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
189     case BLOCK_REPLICATION_FAILOVER_FAILED:
190         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
191     case BLOCK_REPLICATION_DONE:
192         /*
193          * active commit job completes, and active disk and secondary_disk
194          * is swapped, so we can operate bs->file directly
195          */
196         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
197     default:
198         abort();
199     }
200 }
201 
202 static int replication_return_value(BDRVReplicationState *s, int ret)
203 {
204     if (s->mode == REPLICATION_MODE_SECONDARY) {
205         return ret;
206     }
207 
208     if (ret < 0) {
209         s->error = ret;
210         ret = 0;
211     }
212 
213     return ret;
214 }
215 
216 static coroutine_fn int replication_co_readv(BlockDriverState *bs,
217                                              int64_t sector_num,
218                                              int remaining_sectors,
219                                              QEMUIOVector *qiov)
220 {
221     BDRVReplicationState *s = bs->opaque;
222     int ret;
223 
224     if (s->mode == REPLICATION_MODE_PRIMARY) {
225         /* We only use it to forward primary write requests */
226         return -EIO;
227     }
228 
229     ret = replication_get_io_status(s);
230     if (ret < 0) {
231         return ret;
232     }
233 
234     ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
235                          remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
236 
237     return replication_return_value(s, ret);
238 }
239 
240 static coroutine_fn int replication_co_writev(BlockDriverState *bs,
241                                               int64_t sector_num,
242                                               int remaining_sectors,
243                                               QEMUIOVector *qiov,
244                                               int flags)
245 {
246     BDRVReplicationState *s = bs->opaque;
247     QEMUIOVector hd_qiov;
248     uint64_t bytes_done = 0;
249     BdrvChild *top = bs->file;
250     BdrvChild *base = s->secondary_disk;
251     BdrvChild *target;
252     int ret;
253     int64_t n;
254 
255     assert(!flags);
256     ret = replication_get_io_status(s);
257     if (ret < 0) {
258         goto out;
259     }
260 
261     if (ret == 0) {
262         ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
263                               remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
264         return replication_return_value(s, ret);
265     }
266 
267     /*
268      * Failover failed, only write to active disk if the sectors
269      * have already been allocated in active disk/hidden disk.
270      */
271     qemu_iovec_init(&hd_qiov, qiov->niov);
272     while (remaining_sectors > 0) {
273         int64_t count;
274 
275         ret = bdrv_is_allocated_above(top->bs, base->bs,
276                                       sector_num * BDRV_SECTOR_SIZE,
277                                       remaining_sectors * BDRV_SECTOR_SIZE,
278                                       &count);
279         if (ret < 0) {
280             goto out1;
281         }
282 
283         assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
284         n = count >> BDRV_SECTOR_BITS;
285         qemu_iovec_reset(&hd_qiov);
286         qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
287 
288         target = ret ? top : base;
289         ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
290                               n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
291         if (ret < 0) {
292             goto out1;
293         }
294 
295         remaining_sectors -= n;
296         sector_num += n;
297         bytes_done += count;
298     }
299 
300 out1:
301     qemu_iovec_destroy(&hd_qiov);
302 out:
303     return ret;
304 }
305 
306 static bool replication_recurse_is_first_non_filter(BlockDriverState *bs,
307                                                     BlockDriverState *candidate)
308 {
309     return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
310 }
311 
312 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
313 {
314     Error *local_err = NULL;
315     int ret;
316 
317     if (!s->secondary_disk->bs->job) {
318         error_setg(errp, "Backup job was cancelled unexpectedly");
319         return;
320     }
321 
322     backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
323     if (local_err) {
324         error_propagate(errp, local_err);
325         return;
326     }
327 
328     if (!s->active_disk->bs->drv) {
329         error_setg(errp, "Active disk %s is ejected",
330                    s->active_disk->bs->node_name);
331         return;
332     }
333 
334     ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
335     if (ret < 0) {
336         error_setg(errp, "Cannot make active disk empty");
337         return;
338     }
339 
340     if (!s->hidden_disk->bs->drv) {
341         error_setg(errp, "Hidden disk %s is ejected",
342                    s->hidden_disk->bs->node_name);
343         return;
344     }
345 
346     ret = s->hidden_disk->bs->drv->bdrv_make_empty(s->hidden_disk->bs);
347     if (ret < 0) {
348         error_setg(errp, "Cannot make hidden disk empty");
349         return;
350     }
351 }
352 
353 /* This function is supposed to be called twice:
354  * first with writable = true, then with writable = false.
355  * The first call puts s->hidden_disk and s->secondary_disk in
356  * r/w mode, and the second puts them back in their original state.
357  */
358 static void reopen_backing_file(BlockDriverState *bs, bool writable,
359                                 Error **errp)
360 {
361     BDRVReplicationState *s = bs->opaque;
362     BlockReopenQueue *reopen_queue = NULL;
363     Error *local_err = NULL;
364 
365     if (writable) {
366         s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
367         s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
368     }
369 
370     bdrv_subtree_drained_begin(s->hidden_disk->bs);
371     bdrv_subtree_drained_begin(s->secondary_disk->bs);
372 
373     if (s->orig_hidden_read_only) {
374         QDict *opts = qdict_new();
375         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
376         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
377                                          opts);
378     }
379 
380     if (s->orig_secondary_read_only) {
381         QDict *opts = qdict_new();
382         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
383         reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
384                                          opts);
385     }
386 
387     if (reopen_queue) {
388         bdrv_reopen_multiple(bdrv_get_aio_context(bs),
389                              reopen_queue, &local_err);
390         error_propagate(errp, local_err);
391     }
392 
393     bdrv_subtree_drained_end(s->hidden_disk->bs);
394     bdrv_subtree_drained_end(s->secondary_disk->bs);
395 }
396 
397 static void backup_job_cleanup(BlockDriverState *bs)
398 {
399     BDRVReplicationState *s = bs->opaque;
400     BlockDriverState *top_bs;
401 
402     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
403     if (!top_bs) {
404         return;
405     }
406     bdrv_op_unblock_all(top_bs, s->blocker);
407     error_free(s->blocker);
408     reopen_backing_file(bs, false, NULL);
409 }
410 
411 static void backup_job_completed(void *opaque, int ret)
412 {
413     BlockDriverState *bs = opaque;
414     BDRVReplicationState *s = bs->opaque;
415 
416     if (s->stage != BLOCK_REPLICATION_FAILOVER) {
417         /* The backup job is cancelled unexpectedly */
418         s->error = -EIO;
419     }
420 
421     backup_job_cleanup(bs);
422 }
423 
424 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
425 {
426     BdrvChild *child;
427 
428     /* The bs itself is the top_bs */
429     if (top_bs == bs) {
430         return true;
431     }
432 
433     /* Iterate over top_bs's children */
434     QLIST_FOREACH(child, &top_bs->children, next) {
435         if (child->bs == bs || check_top_bs(child->bs, bs)) {
436             return true;
437         }
438     }
439 
440     return false;
441 }
442 
443 static void replication_start(ReplicationState *rs, ReplicationMode mode,
444                               Error **errp)
445 {
446     BlockDriverState *bs = rs->opaque;
447     BDRVReplicationState *s;
448     BlockDriverState *top_bs;
449     int64_t active_length, hidden_length, disk_length;
450     AioContext *aio_context;
451     Error *local_err = NULL;
452     BlockJob *job;
453 
454     aio_context = bdrv_get_aio_context(bs);
455     aio_context_acquire(aio_context);
456     s = bs->opaque;
457 
458     if (s->stage != BLOCK_REPLICATION_NONE) {
459         error_setg(errp, "Block replication is running or done");
460         aio_context_release(aio_context);
461         return;
462     }
463 
464     if (s->mode != mode) {
465         error_setg(errp, "The parameter mode's value is invalid, needs %d,"
466                    " but got %d", s->mode, mode);
467         aio_context_release(aio_context);
468         return;
469     }
470 
471     switch (s->mode) {
472     case REPLICATION_MODE_PRIMARY:
473         break;
474     case REPLICATION_MODE_SECONDARY:
475         s->active_disk = bs->file;
476         if (!s->active_disk || !s->active_disk->bs ||
477                                     !s->active_disk->bs->backing) {
478             error_setg(errp, "Active disk doesn't have backing file");
479             aio_context_release(aio_context);
480             return;
481         }
482 
483         s->hidden_disk = s->active_disk->bs->backing;
484         if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
485             error_setg(errp, "Hidden disk doesn't have backing file");
486             aio_context_release(aio_context);
487             return;
488         }
489 
490         s->secondary_disk = s->hidden_disk->bs->backing;
491         if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
492             error_setg(errp, "The secondary disk doesn't have block backend");
493             aio_context_release(aio_context);
494             return;
495         }
496 
497         /* verify the length */
498         active_length = bdrv_getlength(s->active_disk->bs);
499         hidden_length = bdrv_getlength(s->hidden_disk->bs);
500         disk_length = bdrv_getlength(s->secondary_disk->bs);
501         if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
502             active_length != hidden_length || hidden_length != disk_length) {
503             error_setg(errp, "Active disk, hidden disk, secondary disk's length"
504                        " are not the same");
505             aio_context_release(aio_context);
506             return;
507         }
508 
509         /* Must be true, or the bdrv_getlength() calls would have failed */
510         assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
511 
512         if (!s->active_disk->bs->drv->bdrv_make_empty ||
513             !s->hidden_disk->bs->drv->bdrv_make_empty) {
514             error_setg(errp,
515                        "Active disk or hidden disk doesn't support make_empty");
516             aio_context_release(aio_context);
517             return;
518         }
519 
520         /* reopen the backing file in r/w mode */
521         reopen_backing_file(bs, true, &local_err);
522         if (local_err) {
523             error_propagate(errp, local_err);
524             aio_context_release(aio_context);
525             return;
526         }
527 
528         /* start backup job now */
529         error_setg(&s->blocker,
530                    "Block device is in use by internal backup job");
531 
532         top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
533         if (!top_bs || !bdrv_is_root_node(top_bs) ||
534             !check_top_bs(top_bs, bs)) {
535             error_setg(errp, "No top_bs or it is invalid");
536             reopen_backing_file(bs, false, NULL);
537             aio_context_release(aio_context);
538             return;
539         }
540         bdrv_op_block_all(top_bs, s->blocker);
541         bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
542 
543         job = backup_job_create(NULL, s->secondary_disk->bs, s->hidden_disk->bs,
544                                 0, MIRROR_SYNC_MODE_NONE, NULL, false,
545                                 BLOCKDEV_ON_ERROR_REPORT,
546                                 BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
547                                 backup_job_completed, bs, NULL, &local_err);
548         if (local_err) {
549             error_propagate(errp, local_err);
550             backup_job_cleanup(bs);
551             aio_context_release(aio_context);
552             return;
553         }
554         job_start(&job->job);
555         break;
556     default:
557         aio_context_release(aio_context);
558         abort();
559     }
560 
561     s->stage = BLOCK_REPLICATION_RUNNING;
562 
563     if (s->mode == REPLICATION_MODE_SECONDARY) {
564         secondary_do_checkpoint(s, errp);
565     }
566 
567     s->error = 0;
568     aio_context_release(aio_context);
569 }
570 
571 static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
572 {
573     BlockDriverState *bs = rs->opaque;
574     BDRVReplicationState *s;
575     AioContext *aio_context;
576 
577     aio_context = bdrv_get_aio_context(bs);
578     aio_context_acquire(aio_context);
579     s = bs->opaque;
580 
581     if (s->mode == REPLICATION_MODE_SECONDARY) {
582         secondary_do_checkpoint(s, errp);
583     }
584     aio_context_release(aio_context);
585 }
586 
587 static void replication_get_error(ReplicationState *rs, Error **errp)
588 {
589     BlockDriverState *bs = rs->opaque;
590     BDRVReplicationState *s;
591     AioContext *aio_context;
592 
593     aio_context = bdrv_get_aio_context(bs);
594     aio_context_acquire(aio_context);
595     s = bs->opaque;
596 
597     if (s->stage != BLOCK_REPLICATION_RUNNING) {
598         error_setg(errp, "Block replication is not running");
599         aio_context_release(aio_context);
600         return;
601     }
602 
603     if (s->error) {
604         error_setg(errp, "I/O error occurred");
605         aio_context_release(aio_context);
606         return;
607     }
608     aio_context_release(aio_context);
609 }
610 
611 static void replication_done(void *opaque, int ret)
612 {
613     BlockDriverState *bs = opaque;
614     BDRVReplicationState *s = bs->opaque;
615 
616     if (ret == 0) {
617         s->stage = BLOCK_REPLICATION_DONE;
618 
619         s->active_disk = NULL;
620         s->secondary_disk = NULL;
621         s->hidden_disk = NULL;
622         s->error = 0;
623     } else {
624         s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
625         s->error = -EIO;
626     }
627 }
628 
629 static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
630 {
631     BlockDriverState *bs = rs->opaque;
632     BDRVReplicationState *s;
633     AioContext *aio_context;
634 
635     aio_context = bdrv_get_aio_context(bs);
636     aio_context_acquire(aio_context);
637     s = bs->opaque;
638 
639     if (s->stage != BLOCK_REPLICATION_RUNNING) {
640         error_setg(errp, "Block replication is not running");
641         aio_context_release(aio_context);
642         return;
643     }
644 
645     switch (s->mode) {
646     case REPLICATION_MODE_PRIMARY:
647         s->stage = BLOCK_REPLICATION_DONE;
648         s->error = 0;
649         break;
650     case REPLICATION_MODE_SECONDARY:
651         /*
652          * This BDS will be closed, and the job should be completed
653          * before the BDS is closed, because we will access hidden
654          * disk, secondary disk in backup_job_completed().
655          */
656         if (s->secondary_disk->bs->job) {
657             job_cancel_sync(&s->secondary_disk->bs->job->job);
658         }
659 
660         if (!failover) {
661             secondary_do_checkpoint(s, errp);
662             s->stage = BLOCK_REPLICATION_DONE;
663             aio_context_release(aio_context);
664             return;
665         }
666 
667         s->stage = BLOCK_REPLICATION_FAILOVER;
668         commit_active_start(NULL, s->active_disk->bs, s->secondary_disk->bs,
669                             JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
670                             NULL, replication_done, bs, true, errp);
671         break;
672     default:
673         aio_context_release(aio_context);
674         abort();
675     }
676     aio_context_release(aio_context);
677 }
678 
/*
 * NULL-terminated list of this driver's option names, installed as
 * bdrv_replication.strong_runtime_opts.  Both runtime options are listed.
 */
679 static const char *const replication_strong_runtime_opts[] = {
680     REPLICATION_MODE,
681     REPLICATION_TOP_ID,
682 
683     NULL
684 };
685 
/*
 * Driver definition for the "replication" format.  The node is declared a
 * filter on top of its file child; I/O goes through the sector-based
 * readv/writev callbacks defined in this file.
 */
686 BlockDriver bdrv_replication = {
687     .format_name                = "replication",
688     .instance_size              = sizeof(BDRVReplicationState),
689 
690     .bdrv_open                  = replication_open,
691     .bdrv_close                 = replication_close,
692     .bdrv_child_perm            = replication_child_perm,
693 
694     .bdrv_getlength             = replication_getlength,
695     .bdrv_co_readv              = replication_co_readv,
696     .bdrv_co_writev             = replication_co_writev,
697 
698     .is_filter                  = true,
699     .bdrv_recurse_is_first_non_filter = replication_recurse_is_first_non_filter,
700 
701     .has_variable_length        = true,
702     .strong_runtime_opts        = replication_strong_runtime_opts,
703 };
704 
/*
 * Register the replication driver with the block layer.  The function is
 * handed to block_init() below (presumably run during block-layer module
 * initialisation -- confirm against the block_init() definition).
 */
705 static void bdrv_replication_init(void)
706 {
707     bdrv_register(&bdrv_replication);
708 }
709 
710 block_init(bdrv_replication_init);
711