xref: /openbmc/qemu/block/replication.c (revision 750541c492018e01bad5f34b087397ee6a0b835b)
1 /*
2  * Replication Block filter
3  *
4  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
5  * Copyright (c) 2016 Intel Corporation
6  * Copyright (c) 2016 FUJITSU LIMITED
7  *
8  * Author:
9  *   Wen Congyang <wency@cn.fujitsu.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qemu/option.h"
17 #include "block/nbd.h"
18 #include "block/blockjob.h"
19 #include "block/block_int.h"
20 #include "block/block_backup.h"
21 #include "sysemu/block-backend.h"
22 #include "qapi/error.h"
23 #include "qapi/qmp/qdict.h"
24 #include "replication.h"
25 
/*
 * Stages a replication filter node moves through.  The numeric order
 * matches the order in which the stages are declared.
 */
typedef enum {
    /* block replication is not started */
    BLOCK_REPLICATION_NONE = 0,
    /* block replication is running */
    BLOCK_REPLICATION_RUNNING,
    /* failover is running in background */
    BLOCK_REPLICATION_FAILOVER,
    /* failover failed */
    BLOCK_REPLICATION_FAILOVER_FAILED,
    /* block replication is done */
    BLOCK_REPLICATION_DONE,
} ReplicationStage;
33 
/* Per-node state of the replication filter driver (bs->opaque). */
typedef struct BDRVReplicationState {
    /* Primary or secondary side; parsed from the "mode" option at open */
    ReplicationMode mode;
    /* Current replication stage, see ReplicationStage */
    ReplicationStage stage;
    /* Secondary only: bs->file, set by replication_start() */
    BdrvChild *active_disk;
    /* Secondary only: backing child of the active disk */
    BdrvChild *hidden_disk;
    /* Secondary only: backing child of the hidden disk */
    BdrvChild *secondary_disk;
    /* Secondary only: copy of the "top-id" option, freed on close */
    char *top_id;
    /* Handle returned by replication_new() at open time */
    ReplicationState *rs;
    /* Reason used to block operations on the top node during backup */
    Error *blocker;
    /* Read-only state of the hidden disk before it was reopened r/w */
    bool orig_hidden_read_only;
    /* Read-only state of the secondary disk before it was reopened r/w */
    bool orig_secondary_read_only;
    /* Last recorded error (negative errno), 0 if none */
    int error;
} BDRVReplicationState;
47 
/*
 * Forward declarations of the replication callbacks; they are referenced
 * by the replication_ops table before their definitions appear.
 */
static void replication_start(ReplicationState *rs, ReplicationMode mode,
                              Error **errp);
static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
static void replication_get_error(ReplicationState *rs, Error **errp);
static void replication_stop(ReplicationState *rs, bool failover,
                             Error **errp);
54 
/* Names of the runtime options understood by replication_open() */
#define REPLICATION_MODE        "mode"
#define REPLICATION_TOP_ID      "top-id"
/*
 * Runtime option schema: both options are plain strings.  replication_open()
 * checks their values ("primary"/"secondary" for mode; top-id is required
 * on the secondary side and rejected on the primary side).
 */
static QemuOptsList replication_runtime_opts = {
    .name = "replication",
    .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
    .desc = {
        {
            .name = REPLICATION_MODE,
            .type = QEMU_OPT_STRING,
        },
        {
            .name = REPLICATION_TOP_ID,
            .type = QEMU_OPT_STRING,
        },
        { /* end of list */ }
    },
};
72 
/* Callback table handed to replication_new() in replication_open() */
static ReplicationOps replication_ops = {
    .start = replication_start,
    .checkpoint = replication_do_checkpoint,
    .get_error = replication_get_error,
    .stop = replication_stop,
};
79 
80 static int replication_open(BlockDriverState *bs, QDict *options,
81                             int flags, Error **errp)
82 {
83     int ret;
84     BDRVReplicationState *s = bs->opaque;
85     Error *local_err = NULL;
86     QemuOpts *opts = NULL;
87     const char *mode;
88     const char *top_id;
89 
90     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
91                                false, errp);
92     if (!bs->file) {
93         return -EINVAL;
94     }
95 
96     ret = -EINVAL;
97     opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
98     qemu_opts_absorb_qdict(opts, options, &local_err);
99     if (local_err) {
100         goto fail;
101     }
102 
103     mode = qemu_opt_get(opts, REPLICATION_MODE);
104     if (!mode) {
105         error_setg(&local_err, "Missing the option mode");
106         goto fail;
107     }
108 
109     if (!strcmp(mode, "primary")) {
110         s->mode = REPLICATION_MODE_PRIMARY;
111         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
112         if (top_id) {
113             error_setg(&local_err, "The primary side does not support option top-id");
114             goto fail;
115         }
116     } else if (!strcmp(mode, "secondary")) {
117         s->mode = REPLICATION_MODE_SECONDARY;
118         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
119         s->top_id = g_strdup(top_id);
120         if (!s->top_id) {
121             error_setg(&local_err, "Missing the option top-id");
122             goto fail;
123         }
124     } else {
125         error_setg(&local_err,
126                    "The option mode's value should be primary or secondary");
127         goto fail;
128     }
129 
130     s->rs = replication_new(bs, &replication_ops);
131 
132     ret = 0;
133 
134 fail:
135     qemu_opts_del(opts);
136     error_propagate(errp, local_err);
137 
138     return ret;
139 }
140 
141 static void replication_close(BlockDriverState *bs)
142 {
143     BDRVReplicationState *s = bs->opaque;
144 
145     if (s->stage == BLOCK_REPLICATION_RUNNING) {
146         replication_stop(s->rs, false, NULL);
147     }
148     if (s->stage == BLOCK_REPLICATION_FAILOVER) {
149         job_cancel_sync(&s->active_disk->bs->job->job);
150     }
151 
152     if (s->mode == REPLICATION_MODE_SECONDARY) {
153         g_free(s->top_id);
154     }
155 
156     replication_remove(s->rs);
157 }
158 
159 static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
160                                    const BdrvChildRole *role,
161                                    BlockReopenQueue *reopen_queue,
162                                    uint64_t perm, uint64_t shared,
163                                    uint64_t *nperm, uint64_t *nshared)
164 {
165     *nperm = BLK_PERM_CONSISTENT_READ;
166     if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
167         *nperm |= BLK_PERM_WRITE;
168     }
169     *nshared = BLK_PERM_CONSISTENT_READ \
170                | BLK_PERM_WRITE \
171                | BLK_PERM_WRITE_UNCHANGED;
172     return;
173 }
174 
175 static int64_t replication_getlength(BlockDriverState *bs)
176 {
177     return bdrv_getlength(bs->file->bs);
178 }
179 
180 static int replication_get_io_status(BDRVReplicationState *s)
181 {
182     switch (s->stage) {
183     case BLOCK_REPLICATION_NONE:
184         return -EIO;
185     case BLOCK_REPLICATION_RUNNING:
186         return 0;
187     case BLOCK_REPLICATION_FAILOVER:
188         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
189     case BLOCK_REPLICATION_FAILOVER_FAILED:
190         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
191     case BLOCK_REPLICATION_DONE:
192         /*
193          * active commit job completes, and active disk and secondary_disk
194          * is swapped, so we can operate bs->file directly
195          */
196         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
197     default:
198         abort();
199     }
200 }
201 
202 static int replication_return_value(BDRVReplicationState *s, int ret)
203 {
204     if (s->mode == REPLICATION_MODE_SECONDARY) {
205         return ret;
206     }
207 
208     if (ret < 0) {
209         s->error = ret;
210         ret = 0;
211     }
212 
213     return ret;
214 }
215 
216 static coroutine_fn int replication_co_readv(BlockDriverState *bs,
217                                              int64_t sector_num,
218                                              int remaining_sectors,
219                                              QEMUIOVector *qiov)
220 {
221     BDRVReplicationState *s = bs->opaque;
222     int ret;
223 
224     if (s->mode == REPLICATION_MODE_PRIMARY) {
225         /* We only use it to forward primary write requests */
226         return -EIO;
227     }
228 
229     ret = replication_get_io_status(s);
230     if (ret < 0) {
231         return ret;
232     }
233 
234     ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
235                          remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
236 
237     return replication_return_value(s, ret);
238 }
239 
240 static coroutine_fn int replication_co_writev(BlockDriverState *bs,
241                                               int64_t sector_num,
242                                               int remaining_sectors,
243                                               QEMUIOVector *qiov,
244                                               int flags)
245 {
246     BDRVReplicationState *s = bs->opaque;
247     QEMUIOVector hd_qiov;
248     uint64_t bytes_done = 0;
249     BdrvChild *top = bs->file;
250     BdrvChild *base = s->secondary_disk;
251     BdrvChild *target;
252     int ret;
253     int64_t n;
254 
255     assert(!flags);
256     ret = replication_get_io_status(s);
257     if (ret < 0) {
258         goto out;
259     }
260 
261     if (ret == 0) {
262         ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
263                               remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
264         return replication_return_value(s, ret);
265     }
266 
267     /*
268      * Failover failed, only write to active disk if the sectors
269      * have already been allocated in active disk/hidden disk.
270      */
271     qemu_iovec_init(&hd_qiov, qiov->niov);
272     while (remaining_sectors > 0) {
273         int64_t count;
274 
275         ret = bdrv_is_allocated_above(top->bs, base->bs,
276                                       sector_num * BDRV_SECTOR_SIZE,
277                                       remaining_sectors * BDRV_SECTOR_SIZE,
278                                       &count);
279         if (ret < 0) {
280             goto out1;
281         }
282 
283         assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
284         n = count >> BDRV_SECTOR_BITS;
285         qemu_iovec_reset(&hd_qiov);
286         qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
287 
288         target = ret ? top : base;
289         ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
290                               n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
291         if (ret < 0) {
292             goto out1;
293         }
294 
295         remaining_sectors -= n;
296         sector_num += n;
297         bytes_done += count;
298     }
299 
300 out1:
301     qemu_iovec_destroy(&hd_qiov);
302 out:
303     return ret;
304 }
305 
306 static bool replication_recurse_is_first_non_filter(BlockDriverState *bs,
307                                                     BlockDriverState *candidate)
308 {
309     return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
310 }
311 
312 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
313 {
314     Error *local_err = NULL;
315     int ret;
316 
317     if (!s->secondary_disk->bs->job) {
318         error_setg(errp, "Backup job was cancelled unexpectedly");
319         return;
320     }
321 
322     backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
323     if (local_err) {
324         error_propagate(errp, local_err);
325         return;
326     }
327 
328     if (!s->active_disk->bs->drv) {
329         error_setg(errp, "Active disk %s is ejected",
330                    s->active_disk->bs->node_name);
331         return;
332     }
333 
334     ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
335     if (ret < 0) {
336         error_setg(errp, "Cannot make active disk empty");
337         return;
338     }
339 
340     if (!s->hidden_disk->bs->drv) {
341         error_setg(errp, "Hidden disk %s is ejected",
342                    s->hidden_disk->bs->node_name);
343         return;
344     }
345 
346     ret = s->hidden_disk->bs->drv->bdrv_make_empty(s->hidden_disk->bs);
347     if (ret < 0) {
348         error_setg(errp, "Cannot make hidden disk empty");
349         return;
350     }
351 }
352 
353 /* This function is supposed to be called twice:
354  * first with writable = true, then with writable = false.
355  * The first call puts s->hidden_disk and s->secondary_disk in
356  * r/w mode, and the second puts them back in their original state.
357  */
358 static void reopen_backing_file(BlockDriverState *bs, bool writable,
359                                 Error **errp)
360 {
361     BDRVReplicationState *s = bs->opaque;
362     BlockReopenQueue *reopen_queue = NULL;
363     Error *local_err = NULL;
364 
365     if (writable) {
366         s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
367         s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
368     }
369 
370     bdrv_subtree_drained_begin(s->hidden_disk->bs);
371     bdrv_subtree_drained_begin(s->secondary_disk->bs);
372 
373     if (s->orig_hidden_read_only) {
374         QDict *opts = qdict_new();
375         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
376         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
377                                          opts, true);
378     }
379 
380     if (s->orig_secondary_read_only) {
381         QDict *opts = qdict_new();
382         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
383         reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
384                                          opts, true);
385     }
386 
387     if (reopen_queue) {
388         bdrv_reopen_multiple(reopen_queue, &local_err);
389         error_propagate(errp, local_err);
390     }
391 
392     bdrv_subtree_drained_end(s->hidden_disk->bs);
393     bdrv_subtree_drained_end(s->secondary_disk->bs);
394 }
395 
396 static void backup_job_cleanup(BlockDriverState *bs)
397 {
398     BDRVReplicationState *s = bs->opaque;
399     BlockDriverState *top_bs;
400 
401     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
402     if (!top_bs) {
403         return;
404     }
405     bdrv_op_unblock_all(top_bs, s->blocker);
406     error_free(s->blocker);
407     reopen_backing_file(bs, false, NULL);
408 }
409 
410 static void backup_job_completed(void *opaque, int ret)
411 {
412     BlockDriverState *bs = opaque;
413     BDRVReplicationState *s = bs->opaque;
414 
415     if (s->stage != BLOCK_REPLICATION_FAILOVER) {
416         /* The backup job is cancelled unexpectedly */
417         s->error = -EIO;
418     }
419 
420     backup_job_cleanup(bs);
421 }
422 
423 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
424 {
425     BdrvChild *child;
426 
427     /* The bs itself is the top_bs */
428     if (top_bs == bs) {
429         return true;
430     }
431 
432     /* Iterate over top_bs's children */
433     QLIST_FOREACH(child, &top_bs->children, next) {
434         if (child->bs == bs || check_top_bs(child->bs, bs)) {
435             return true;
436         }
437     }
438 
439     return false;
440 }
441 
442 static void replication_start(ReplicationState *rs, ReplicationMode mode,
443                               Error **errp)
444 {
445     BlockDriverState *bs = rs->opaque;
446     BDRVReplicationState *s;
447     BlockDriverState *top_bs;
448     int64_t active_length, hidden_length, disk_length;
449     AioContext *aio_context;
450     Error *local_err = NULL;
451     BlockJob *job;
452 
453     aio_context = bdrv_get_aio_context(bs);
454     aio_context_acquire(aio_context);
455     s = bs->opaque;
456 
457     if (s->stage != BLOCK_REPLICATION_NONE) {
458         error_setg(errp, "Block replication is running or done");
459         aio_context_release(aio_context);
460         return;
461     }
462 
463     if (s->mode != mode) {
464         error_setg(errp, "The parameter mode's value is invalid, needs %d,"
465                    " but got %d", s->mode, mode);
466         aio_context_release(aio_context);
467         return;
468     }
469 
470     switch (s->mode) {
471     case REPLICATION_MODE_PRIMARY:
472         break;
473     case REPLICATION_MODE_SECONDARY:
474         s->active_disk = bs->file;
475         if (!s->active_disk || !s->active_disk->bs ||
476                                     !s->active_disk->bs->backing) {
477             error_setg(errp, "Active disk doesn't have backing file");
478             aio_context_release(aio_context);
479             return;
480         }
481 
482         s->hidden_disk = s->active_disk->bs->backing;
483         if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
484             error_setg(errp, "Hidden disk doesn't have backing file");
485             aio_context_release(aio_context);
486             return;
487         }
488 
489         s->secondary_disk = s->hidden_disk->bs->backing;
490         if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
491             error_setg(errp, "The secondary disk doesn't have block backend");
492             aio_context_release(aio_context);
493             return;
494         }
495 
496         /* verify the length */
497         active_length = bdrv_getlength(s->active_disk->bs);
498         hidden_length = bdrv_getlength(s->hidden_disk->bs);
499         disk_length = bdrv_getlength(s->secondary_disk->bs);
500         if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
501             active_length != hidden_length || hidden_length != disk_length) {
502             error_setg(errp, "Active disk, hidden disk, secondary disk's length"
503                        " are not the same");
504             aio_context_release(aio_context);
505             return;
506         }
507 
508         /* Must be true, or the bdrv_getlength() calls would have failed */
509         assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
510 
511         if (!s->active_disk->bs->drv->bdrv_make_empty ||
512             !s->hidden_disk->bs->drv->bdrv_make_empty) {
513             error_setg(errp,
514                        "Active disk or hidden disk doesn't support make_empty");
515             aio_context_release(aio_context);
516             return;
517         }
518 
519         /* reopen the backing file in r/w mode */
520         reopen_backing_file(bs, true, &local_err);
521         if (local_err) {
522             error_propagate(errp, local_err);
523             aio_context_release(aio_context);
524             return;
525         }
526 
527         /* start backup job now */
528         error_setg(&s->blocker,
529                    "Block device is in use by internal backup job");
530 
531         top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
532         if (!top_bs || !bdrv_is_root_node(top_bs) ||
533             !check_top_bs(top_bs, bs)) {
534             error_setg(errp, "No top_bs or it is invalid");
535             reopen_backing_file(bs, false, NULL);
536             aio_context_release(aio_context);
537             return;
538         }
539         bdrv_op_block_all(top_bs, s->blocker);
540         bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
541 
542         job = backup_job_create(NULL, s->secondary_disk->bs, s->hidden_disk->bs,
543                                 0, MIRROR_SYNC_MODE_NONE, NULL, false,
544                                 BLOCKDEV_ON_ERROR_REPORT,
545                                 BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
546                                 backup_job_completed, bs, NULL, &local_err);
547         if (local_err) {
548             error_propagate(errp, local_err);
549             backup_job_cleanup(bs);
550             aio_context_release(aio_context);
551             return;
552         }
553         job_start(&job->job);
554         break;
555     default:
556         aio_context_release(aio_context);
557         abort();
558     }
559 
560     s->stage = BLOCK_REPLICATION_RUNNING;
561 
562     if (s->mode == REPLICATION_MODE_SECONDARY) {
563         secondary_do_checkpoint(s, errp);
564     }
565 
566     s->error = 0;
567     aio_context_release(aio_context);
568 }
569 
570 static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
571 {
572     BlockDriverState *bs = rs->opaque;
573     BDRVReplicationState *s;
574     AioContext *aio_context;
575 
576     aio_context = bdrv_get_aio_context(bs);
577     aio_context_acquire(aio_context);
578     s = bs->opaque;
579 
580     if (s->mode == REPLICATION_MODE_SECONDARY) {
581         secondary_do_checkpoint(s, errp);
582     }
583     aio_context_release(aio_context);
584 }
585 
586 static void replication_get_error(ReplicationState *rs, Error **errp)
587 {
588     BlockDriverState *bs = rs->opaque;
589     BDRVReplicationState *s;
590     AioContext *aio_context;
591 
592     aio_context = bdrv_get_aio_context(bs);
593     aio_context_acquire(aio_context);
594     s = bs->opaque;
595 
596     if (s->stage != BLOCK_REPLICATION_RUNNING) {
597         error_setg(errp, "Block replication is not running");
598         aio_context_release(aio_context);
599         return;
600     }
601 
602     if (s->error) {
603         error_setg(errp, "I/O error occurred");
604         aio_context_release(aio_context);
605         return;
606     }
607     aio_context_release(aio_context);
608 }
609 
610 static void replication_done(void *opaque, int ret)
611 {
612     BlockDriverState *bs = opaque;
613     BDRVReplicationState *s = bs->opaque;
614 
615     if (ret == 0) {
616         s->stage = BLOCK_REPLICATION_DONE;
617 
618         s->active_disk = NULL;
619         s->secondary_disk = NULL;
620         s->hidden_disk = NULL;
621         s->error = 0;
622     } else {
623         s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
624         s->error = -EIO;
625     }
626 }
627 
628 static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
629 {
630     BlockDriverState *bs = rs->opaque;
631     BDRVReplicationState *s;
632     AioContext *aio_context;
633 
634     aio_context = bdrv_get_aio_context(bs);
635     aio_context_acquire(aio_context);
636     s = bs->opaque;
637 
638     if (s->stage != BLOCK_REPLICATION_RUNNING) {
639         error_setg(errp, "Block replication is not running");
640         aio_context_release(aio_context);
641         return;
642     }
643 
644     switch (s->mode) {
645     case REPLICATION_MODE_PRIMARY:
646         s->stage = BLOCK_REPLICATION_DONE;
647         s->error = 0;
648         break;
649     case REPLICATION_MODE_SECONDARY:
650         /*
651          * This BDS will be closed, and the job should be completed
652          * before the BDS is closed, because we will access hidden
653          * disk, secondary disk in backup_job_completed().
654          */
655         if (s->secondary_disk->bs->job) {
656             job_cancel_sync(&s->secondary_disk->bs->job->job);
657         }
658 
659         if (!failover) {
660             secondary_do_checkpoint(s, errp);
661             s->stage = BLOCK_REPLICATION_DONE;
662             aio_context_release(aio_context);
663             return;
664         }
665 
666         s->stage = BLOCK_REPLICATION_FAILOVER;
667         commit_active_start(NULL, s->active_disk->bs, s->secondary_disk->bs,
668                             JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
669                             NULL, replication_done, bs, true, errp);
670         break;
671     default:
672         aio_context_release(aio_context);
673         abort();
674     }
675     aio_context_release(aio_context);
676 }
677 
/*
 * NULL-terminated list of option names, wired into
 * BlockDriver.strong_runtime_opts below.
 */
static const char *const replication_strong_runtime_opts[] = {
    REPLICATION_MODE,
    REPLICATION_TOP_ID,

    NULL
};
684 
/*
 * Driver description for the "replication" filter; registered with
 * bdrv_register() in bdrv_replication_init().
 */
static BlockDriver bdrv_replication = {
    .format_name                = "replication",
    .instance_size              = sizeof(BDRVReplicationState),

    /* node lifecycle and permissions */
    .bdrv_open                  = replication_open,
    .bdrv_close                 = replication_close,
    .bdrv_child_perm            = replication_child_perm,

    /* sector-based I/O and length query */
    .bdrv_getlength             = replication_getlength,
    .bdrv_co_readv              = replication_co_readv,
    .bdrv_co_writev             = replication_co_writev,

    /* the node is a filter on top of bs->file */
    .is_filter                  = true,
    .bdrv_recurse_is_first_non_filter = replication_recurse_is_first_non_filter,

    .has_variable_length        = true,
    .strong_runtime_opts        = replication_strong_runtime_opts,
};
703 
/* Register the replication driver with the block layer. */
static void bdrv_replication_init(void)
{
    bdrv_register(&bdrv_replication);
}

/* Hook the registration function into QEMU's block_init() mechanism */
block_init(bdrv_replication_init);
710