xref: /openbmc/qemu/block/replication.c (revision 0885f1221e0add5529dada1e7948d2c00189cb8b)
1 /*
2  * Replication Block filter
3  *
4  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
5  * Copyright (c) 2016 Intel Corporation
6  * Copyright (c) 2016 FUJITSU LIMITED
7  *
8  * Author:
9  *   Wen Congyang <wency@cn.fujitsu.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qemu/module.h"
17 #include "qemu/option.h"
18 #include "block/nbd.h"
19 #include "block/blockjob.h"
20 #include "block/block_int.h"
21 #include "block/block_backup.h"
22 #include "sysemu/block-backend.h"
23 #include "qapi/error.h"
24 #include "qapi/qmp/qdict.h"
25 #include "block/replication.h"
26 
/*
 * Lifecycle stage of a replication node.  The enumerators keep their
 * implicit values (0..4) in declaration order.
 */
typedef enum {
    /* block replication is not started */
    BLOCK_REPLICATION_NONE,
    /* block replication is running */
    BLOCK_REPLICATION_RUNNING,
    /* failover is running in background */
    BLOCK_REPLICATION_FAILOVER,
    /* failover failed */
    BLOCK_REPLICATION_FAILOVER_FAILED,
    /* block replication is done */
    BLOCK_REPLICATION_DONE,
} ReplicationStage;
34 
35 typedef struct BDRVReplicationState {
36     ReplicationMode mode;
37     ReplicationStage stage;
38     BlockJob *commit_job;
39     BdrvChild *hidden_disk;
40     BdrvChild *secondary_disk;
41     BlockJob *backup_job;
42     char *top_id;
43     ReplicationState *rs;
44     Error *blocker;
45     bool orig_hidden_read_only;
46     bool orig_secondary_read_only;
47     int error;
48 } BDRVReplicationState;
49 
50 static void replication_start(ReplicationState *rs, ReplicationMode mode,
51                               Error **errp);
52 static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
53 static void replication_get_error(ReplicationState *rs, Error **errp);
54 static void replication_stop(ReplicationState *rs, bool failover,
55                              Error **errp);
56 
57 #define REPLICATION_MODE        "mode"
58 #define REPLICATION_TOP_ID      "top-id"
59 static QemuOptsList replication_runtime_opts = {
60     .name = "replication",
61     .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
62     .desc = {
63         {
64             .name = REPLICATION_MODE,
65             .type = QEMU_OPT_STRING,
66         },
67         {
68             .name = REPLICATION_TOP_ID,
69             .type = QEMU_OPT_STRING,
70         },
71         { /* end of list */ }
72     },
73 };
74 
75 static ReplicationOps replication_ops = {
76     .start = replication_start,
77     .checkpoint = replication_do_checkpoint,
78     .get_error = replication_get_error,
79     .stop = replication_stop,
80 };
81 
82 static int replication_open(BlockDriverState *bs, QDict *options,
83                             int flags, Error **errp)
84 {
85     int ret;
86     BDRVReplicationState *s = bs->opaque;
87     QemuOpts *opts = NULL;
88     const char *mode;
89     const char *top_id;
90 
91     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
92     if (ret < 0) {
93         return ret;
94     }
95 
96     ret = -EINVAL;
97     opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
98     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
99         goto fail;
100     }
101 
102     mode = qemu_opt_get(opts, REPLICATION_MODE);
103     if (!mode) {
104         error_setg(errp, "Missing the option mode");
105         goto fail;
106     }
107 
108     if (!strcmp(mode, "primary")) {
109         s->mode = REPLICATION_MODE_PRIMARY;
110         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
111         if (top_id) {
112             error_setg(errp,
113                        "The primary side does not support option top-id");
114             goto fail;
115         }
116     } else if (!strcmp(mode, "secondary")) {
117         s->mode = REPLICATION_MODE_SECONDARY;
118         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
119         s->top_id = g_strdup(top_id);
120         if (!s->top_id) {
121             error_setg(errp, "Missing the option top-id");
122             goto fail;
123         }
124     } else {
125         error_setg(errp,
126                    "The option mode's value should be primary or secondary");
127         goto fail;
128     }
129 
130     s->rs = replication_new(bs, &replication_ops);
131 
132     ret = 0;
133 
134 fail:
135     qemu_opts_del(opts);
136     return ret;
137 }
138 
139 static void replication_close(BlockDriverState *bs)
140 {
141     BDRVReplicationState *s = bs->opaque;
142     Job *commit_job;
143     GLOBAL_STATE_CODE();
144 
145     if (s->stage == BLOCK_REPLICATION_RUNNING) {
146         replication_stop(s->rs, false, NULL);
147     }
148     if (s->stage == BLOCK_REPLICATION_FAILOVER) {
149         commit_job = &s->commit_job->job;
150         assert(commit_job->aio_context == qemu_get_current_aio_context());
151         job_cancel_sync(commit_job, false);
152     }
153 
154     if (s->mode == REPLICATION_MODE_SECONDARY) {
155         g_free(s->top_id);
156     }
157 
158     replication_remove(s->rs);
159 }
160 
161 static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
162                                    BdrvChildRole role,
163                                    BlockReopenQueue *reopen_queue,
164                                    uint64_t perm, uint64_t shared,
165                                    uint64_t *nperm, uint64_t *nshared)
166 {
167     if (role & BDRV_CHILD_PRIMARY) {
168         *nperm = BLK_PERM_CONSISTENT_READ;
169     } else {
170         *nperm = 0;
171     }
172 
173     if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
174         *nperm |= BLK_PERM_WRITE;
175     }
176     *nshared = BLK_PERM_CONSISTENT_READ
177                | BLK_PERM_WRITE
178                | BLK_PERM_WRITE_UNCHANGED;
179     return;
180 }
181 
182 static int64_t coroutine_fn GRAPH_RDLOCK
183 replication_co_getlength(BlockDriverState *bs)
184 {
185     return bdrv_co_getlength(bs->file->bs);
186 }
187 
188 static int replication_get_io_status(BDRVReplicationState *s)
189 {
190     switch (s->stage) {
191     case BLOCK_REPLICATION_NONE:
192         return -EIO;
193     case BLOCK_REPLICATION_RUNNING:
194         return 0;
195     case BLOCK_REPLICATION_FAILOVER:
196         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
197     case BLOCK_REPLICATION_FAILOVER_FAILED:
198         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
199     case BLOCK_REPLICATION_DONE:
200         /*
201          * active commit job completes, and active disk and secondary_disk
202          * is swapped, so we can operate bs->file directly
203          */
204         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
205     default:
206         abort();
207     }
208 }
209 
210 static int replication_return_value(BDRVReplicationState *s, int ret)
211 {
212     if (s->mode == REPLICATION_MODE_SECONDARY) {
213         return ret;
214     }
215 
216     if (ret < 0) {
217         s->error = ret;
218         ret = 0;
219     }
220 
221     return ret;
222 }
223 
224 static int coroutine_fn GRAPH_RDLOCK
225 replication_co_readv(BlockDriverState *bs, int64_t sector_num,
226                      int remaining_sectors, QEMUIOVector *qiov)
227 {
228     BDRVReplicationState *s = bs->opaque;
229     int ret;
230 
231     if (s->mode == REPLICATION_MODE_PRIMARY) {
232         /* We only use it to forward primary write requests */
233         return -EIO;
234     }
235 
236     ret = replication_get_io_status(s);
237     if (ret < 0) {
238         return ret;
239     }
240 
241     ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
242                          remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
243 
244     return replication_return_value(s, ret);
245 }
246 
247 static int coroutine_fn GRAPH_RDLOCK
248 replication_co_writev(BlockDriverState *bs, int64_t sector_num,
249                       int remaining_sectors, QEMUIOVector *qiov, int flags)
250 {
251     BDRVReplicationState *s = bs->opaque;
252     QEMUIOVector hd_qiov;
253     uint64_t bytes_done = 0;
254     BdrvChild *top = bs->file;
255     BdrvChild *base = s->secondary_disk;
256     BdrvChild *target;
257     int ret;
258     int64_t n;
259 
260     ret = replication_get_io_status(s);
261     if (ret < 0) {
262         goto out;
263     }
264 
265     if (ret == 0) {
266         ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
267                               remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
268         return replication_return_value(s, ret);
269     }
270 
271     /*
272      * Failover failed, only write to active disk if the sectors
273      * have already been allocated in active disk/hidden disk.
274      */
275     qemu_iovec_init(&hd_qiov, qiov->niov);
276     while (remaining_sectors > 0) {
277         int64_t count;
278 
279         ret = bdrv_co_is_allocated_above(top->bs, base->bs, false,
280                                          sector_num * BDRV_SECTOR_SIZE,
281                                          remaining_sectors * BDRV_SECTOR_SIZE,
282                                          &count);
283         if (ret < 0) {
284             goto out1;
285         }
286 
287         assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
288         n = count >> BDRV_SECTOR_BITS;
289         qemu_iovec_reset(&hd_qiov);
290         qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
291 
292         target = ret ? top : base;
293         ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
294                               n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
295         if (ret < 0) {
296             goto out1;
297         }
298 
299         remaining_sectors -= n;
300         sector_num += n;
301         bytes_done += count;
302     }
303 
304 out1:
305     qemu_iovec_destroy(&hd_qiov);
306 out:
307     return ret;
308 }
309 
310 static void GRAPH_UNLOCKED
311 secondary_do_checkpoint(BlockDriverState *bs, Error **errp)
312 {
313     BDRVReplicationState *s = bs->opaque;
314     BdrvChild *active_disk = bs->file;
315     Error *local_err = NULL;
316     int ret;
317 
318     GRAPH_RDLOCK_GUARD_MAINLOOP();
319 
320     if (!s->backup_job) {
321         error_setg(errp, "Backup job was cancelled unexpectedly");
322         return;
323     }
324 
325     backup_do_checkpoint(s->backup_job, &local_err);
326     if (local_err) {
327         error_propagate(errp, local_err);
328         return;
329     }
330 
331     if (!active_disk->bs->drv) {
332         error_setg(errp, "Active disk %s is ejected",
333                    active_disk->bs->node_name);
334         return;
335     }
336 
337     ret = bdrv_make_empty(active_disk, errp);
338     if (ret < 0) {
339         return;
340     }
341 
342     if (!s->hidden_disk->bs->drv) {
343         error_setg(errp, "Hidden disk %s is ejected",
344                    s->hidden_disk->bs->node_name);
345         return;
346     }
347 
348     ret = bdrv_make_empty(s->hidden_disk, errp);
349     if (ret < 0) {
350         return;
351     }
352 }
353 
354 /* This function is supposed to be called twice:
355  * first with writable = true, then with writable = false.
356  * The first call puts s->hidden_disk and s->secondary_disk in
357  * r/w mode, and the second puts them back in their original state.
358  */
359 static void reopen_backing_file(BlockDriverState *bs, bool writable,
360                                 Error **errp)
361 {
362     BDRVReplicationState *s = bs->opaque;
363     BdrvChild *hidden_disk, *secondary_disk;
364     BlockReopenQueue *reopen_queue = NULL;
365 
366     /*
367      * s->hidden_disk and s->secondary_disk may not be set yet, as they will
368      * only be set after the children are writable.
369      */
370     hidden_disk = bs->file->bs->backing;
371     secondary_disk = hidden_disk->bs->backing;
372 
373     if (writable) {
374         s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs);
375         s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
376     }
377 
378     if (s->orig_hidden_read_only) {
379         QDict *opts = qdict_new();
380         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
381         reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs,
382                                          opts, true);
383     }
384 
385     if (s->orig_secondary_read_only) {
386         QDict *opts = qdict_new();
387         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
388         reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs,
389                                          opts, true);
390     }
391 
392     if (reopen_queue) {
393         AioContext *ctx = bdrv_get_aio_context(bs);
394         if (ctx != qemu_get_aio_context()) {
395             aio_context_release(ctx);
396         }
397         bdrv_reopen_multiple(reopen_queue, errp);
398         if (ctx != qemu_get_aio_context()) {
399             aio_context_acquire(ctx);
400         }
401     }
402 }
403 
404 static void backup_job_cleanup(BlockDriverState *bs)
405 {
406     BDRVReplicationState *s = bs->opaque;
407     BlockDriverState *top_bs;
408 
409     s->backup_job = NULL;
410 
411     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
412     if (!top_bs) {
413         return;
414     }
415     bdrv_op_unblock_all(top_bs, s->blocker);
416     error_free(s->blocker);
417     reopen_backing_file(bs, false, NULL);
418 }
419 
420 static void backup_job_completed(void *opaque, int ret)
421 {
422     BlockDriverState *bs = opaque;
423     BDRVReplicationState *s = bs->opaque;
424 
425     if (s->stage != BLOCK_REPLICATION_FAILOVER) {
426         /* The backup job is cancelled unexpectedly */
427         s->error = -EIO;
428     }
429 
430     backup_job_cleanup(bs);
431 }
432 
433 static bool GRAPH_RDLOCK
434 check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
435 {
436     BdrvChild *child;
437 
438     /* The bs itself is the top_bs */
439     if (top_bs == bs) {
440         return true;
441     }
442 
443     /* Iterate over top_bs's children */
444     QLIST_FOREACH(child, &top_bs->children, next) {
445         if (child->bs == bs || check_top_bs(child->bs, bs)) {
446             return true;
447         }
448     }
449 
450     return false;
451 }
452 
453 static void replication_start(ReplicationState *rs, ReplicationMode mode,
454                               Error **errp)
455 {
456     BlockDriverState *bs = rs->opaque;
457     BDRVReplicationState *s;
458     BlockDriverState *top_bs;
459     BdrvChild *active_disk, *hidden_disk, *secondary_disk;
460     int64_t active_length, hidden_length, disk_length;
461     AioContext *aio_context;
462     Error *local_err = NULL;
463     BackupPerf perf = { .use_copy_range = true, .max_workers = 1 };
464 
465     GLOBAL_STATE_CODE();
466 
467     aio_context = bdrv_get_aio_context(bs);
468     aio_context_acquire(aio_context);
469     s = bs->opaque;
470 
471     if (s->stage == BLOCK_REPLICATION_DONE ||
472         s->stage == BLOCK_REPLICATION_FAILOVER) {
473         /*
474          * This case happens when a secondary is promoted to primary.
475          * Ignore the request because the secondary side of replication
476          * doesn't have to do anything anymore.
477          */
478         aio_context_release(aio_context);
479         return;
480     }
481 
482     if (s->stage != BLOCK_REPLICATION_NONE) {
483         error_setg(errp, "Block replication is running or done");
484         aio_context_release(aio_context);
485         return;
486     }
487 
488     if (s->mode != mode) {
489         error_setg(errp, "The parameter mode's value is invalid, needs %d,"
490                    " but got %d", s->mode, mode);
491         aio_context_release(aio_context);
492         return;
493     }
494 
495     switch (s->mode) {
496     case REPLICATION_MODE_PRIMARY:
497         break;
498     case REPLICATION_MODE_SECONDARY:
499         active_disk = bs->file;
500         if (!active_disk || !active_disk->bs || !active_disk->bs->backing) {
501             error_setg(errp, "Active disk doesn't have backing file");
502             aio_context_release(aio_context);
503             return;
504         }
505 
506         hidden_disk = active_disk->bs->backing;
507         if (!hidden_disk->bs || !hidden_disk->bs->backing) {
508             error_setg(errp, "Hidden disk doesn't have backing file");
509             aio_context_release(aio_context);
510             return;
511         }
512 
513         bdrv_graph_rdlock_main_loop();
514         secondary_disk = hidden_disk->bs->backing;
515         if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
516             error_setg(errp, "The secondary disk doesn't have block backend");
517             bdrv_graph_rdunlock_main_loop();
518             aio_context_release(aio_context);
519             return;
520         }
521         bdrv_graph_rdunlock_main_loop();
522 
523         /* verify the length */
524         active_length = bdrv_getlength(active_disk->bs);
525         hidden_length = bdrv_getlength(hidden_disk->bs);
526         disk_length = bdrv_getlength(secondary_disk->bs);
527         if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
528             active_length != hidden_length || hidden_length != disk_length) {
529             error_setg(errp, "Active disk, hidden disk, secondary disk's length"
530                        " are not the same");
531             aio_context_release(aio_context);
532             return;
533         }
534 
535         /* Must be true, or the bdrv_getlength() calls would have failed */
536         assert(active_disk->bs->drv && hidden_disk->bs->drv);
537 
538         bdrv_graph_rdlock_main_loop();
539         if (!active_disk->bs->drv->bdrv_make_empty ||
540             !hidden_disk->bs->drv->bdrv_make_empty) {
541             error_setg(errp,
542                        "Active disk or hidden disk doesn't support make_empty");
543             aio_context_release(aio_context);
544             bdrv_graph_rdunlock_main_loop();
545             return;
546         }
547         bdrv_graph_rdunlock_main_loop();
548 
549         /* reopen the backing file in r/w mode */
550         reopen_backing_file(bs, true, &local_err);
551         if (local_err) {
552             error_propagate(errp, local_err);
553             aio_context_release(aio_context);
554             return;
555         }
556 
557         bdrv_graph_wrlock(bs);
558 
559         bdrv_ref(hidden_disk->bs);
560         s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
561                                            &child_of_bds, BDRV_CHILD_DATA,
562                                            &local_err);
563         if (local_err) {
564             error_propagate(errp, local_err);
565             bdrv_graph_wrunlock();
566             aio_context_release(aio_context);
567             return;
568         }
569 
570         bdrv_ref(secondary_disk->bs);
571         s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs,
572                                               "secondary disk", &child_of_bds,
573                                               BDRV_CHILD_DATA, &local_err);
574         if (local_err) {
575             error_propagate(errp, local_err);
576             bdrv_graph_wrunlock();
577             aio_context_release(aio_context);
578             return;
579         }
580 
581         /* start backup job now */
582         error_setg(&s->blocker,
583                    "Block device is in use by internal backup job");
584 
585         top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
586         if (!top_bs || !bdrv_is_root_node(top_bs) ||
587             !check_top_bs(top_bs, bs)) {
588             error_setg(errp, "No top_bs or it is invalid");
589             bdrv_graph_wrunlock();
590             reopen_backing_file(bs, false, NULL);
591             aio_context_release(aio_context);
592             return;
593         }
594         bdrv_op_block_all(top_bs, s->blocker);
595         bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
596 
597         bdrv_graph_wrunlock();
598 
599         s->backup_job = backup_job_create(
600                                 NULL, s->secondary_disk->bs, s->hidden_disk->bs,
601                                 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
602                                 &perf,
603                                 BLOCKDEV_ON_ERROR_REPORT,
604                                 BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
605                                 backup_job_completed, bs, NULL, &local_err);
606         if (local_err) {
607             error_propagate(errp, local_err);
608             backup_job_cleanup(bs);
609             aio_context_release(aio_context);
610             return;
611         }
612         job_start(&s->backup_job->job);
613         break;
614     default:
615         aio_context_release(aio_context);
616         abort();
617     }
618 
619     s->stage = BLOCK_REPLICATION_RUNNING;
620 
621     if (s->mode == REPLICATION_MODE_SECONDARY) {
622         secondary_do_checkpoint(bs, errp);
623     }
624 
625     s->error = 0;
626     aio_context_release(aio_context);
627 }
628 
629 static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
630 {
631     BlockDriverState *bs = rs->opaque;
632     BDRVReplicationState *s;
633     AioContext *aio_context;
634 
635     aio_context = bdrv_get_aio_context(bs);
636     aio_context_acquire(aio_context);
637     s = bs->opaque;
638 
639     if (s->stage == BLOCK_REPLICATION_DONE ||
640         s->stage == BLOCK_REPLICATION_FAILOVER) {
641         /*
642          * This case happens when a secondary was promoted to primary.
643          * Ignore the request because the secondary side of replication
644          * doesn't have to do anything anymore.
645          */
646         aio_context_release(aio_context);
647         return;
648     }
649 
650     if (s->mode == REPLICATION_MODE_SECONDARY) {
651         secondary_do_checkpoint(bs, errp);
652     }
653     aio_context_release(aio_context);
654 }
655 
656 static void replication_get_error(ReplicationState *rs, Error **errp)
657 {
658     BlockDriverState *bs = rs->opaque;
659     BDRVReplicationState *s;
660     AioContext *aio_context;
661 
662     aio_context = bdrv_get_aio_context(bs);
663     aio_context_acquire(aio_context);
664     s = bs->opaque;
665 
666     if (s->stage == BLOCK_REPLICATION_NONE) {
667         error_setg(errp, "Block replication is not running");
668         aio_context_release(aio_context);
669         return;
670     }
671 
672     if (s->error) {
673         error_setg(errp, "I/O error occurred");
674         aio_context_release(aio_context);
675         return;
676     }
677     aio_context_release(aio_context);
678 }
679 
680 static void replication_done(void *opaque, int ret)
681 {
682     BlockDriverState *bs = opaque;
683     BDRVReplicationState *s = bs->opaque;
684 
685     if (ret == 0) {
686         s->stage = BLOCK_REPLICATION_DONE;
687 
688         bdrv_graph_wrlock(NULL);
689         bdrv_unref_child(bs, s->secondary_disk);
690         s->secondary_disk = NULL;
691         bdrv_unref_child(bs, s->hidden_disk);
692         s->hidden_disk = NULL;
693         bdrv_graph_wrunlock();
694 
695         s->error = 0;
696     } else {
697         s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
698         s->error = -EIO;
699     }
700 }
701 
702 static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
703 {
704     BlockDriverState *bs = rs->opaque;
705     BDRVReplicationState *s;
706     AioContext *aio_context;
707 
708     aio_context = bdrv_get_aio_context(bs);
709     aio_context_acquire(aio_context);
710     s = bs->opaque;
711 
712     if (s->stage == BLOCK_REPLICATION_DONE ||
713         s->stage == BLOCK_REPLICATION_FAILOVER) {
714         /*
715          * This case happens when a secondary was promoted to primary.
716          * Ignore the request because the secondary side of replication
717          * doesn't have to do anything anymore.
718          */
719         aio_context_release(aio_context);
720         return;
721     }
722 
723     if (s->stage != BLOCK_REPLICATION_RUNNING) {
724         error_setg(errp, "Block replication is not running");
725         aio_context_release(aio_context);
726         return;
727     }
728 
729     switch (s->mode) {
730     case REPLICATION_MODE_PRIMARY:
731         s->stage = BLOCK_REPLICATION_DONE;
732         s->error = 0;
733         break;
734     case REPLICATION_MODE_SECONDARY:
735         /*
736          * This BDS will be closed, and the job should be completed
737          * before the BDS is closed, because we will access hidden
738          * disk, secondary disk in backup_job_completed().
739          */
740         if (s->backup_job) {
741             aio_context_release(aio_context);
742             job_cancel_sync(&s->backup_job->job, true);
743             aio_context_acquire(aio_context);
744         }
745 
746         if (!failover) {
747             secondary_do_checkpoint(bs, errp);
748             s->stage = BLOCK_REPLICATION_DONE;
749             aio_context_release(aio_context);
750             return;
751         }
752 
753         s->stage = BLOCK_REPLICATION_FAILOVER;
754         s->commit_job = commit_active_start(
755                             NULL, bs->file->bs, s->secondary_disk->bs,
756                             JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
757                             NULL, replication_done, bs, true, errp);
758         break;
759     default:
760         aio_context_release(aio_context);
761         abort();
762     }
763     aio_context_release(aio_context);
764 }
765 
766 static const char *const replication_strong_runtime_opts[] = {
767     REPLICATION_MODE,
768     REPLICATION_TOP_ID,
769 
770     NULL
771 };
772 
773 static BlockDriver bdrv_replication = {
774     .format_name                = "replication",
775     .instance_size              = sizeof(BDRVReplicationState),
776 
777     .bdrv_open                  = replication_open,
778     .bdrv_close                 = replication_close,
779     .bdrv_child_perm            = replication_child_perm,
780 
781     .bdrv_co_getlength          = replication_co_getlength,
782     .bdrv_co_readv              = replication_co_readv,
783     .bdrv_co_writev             = replication_co_writev,
784 
785     .is_filter                  = true,
786 
787     .strong_runtime_opts        = replication_strong_runtime_opts,
788 };
789 
790 static void bdrv_replication_init(void)
791 {
792     bdrv_register(&bdrv_replication);
793 }
794 
795 block_init(bdrv_replication_init);
796