xref: /openbmc/qemu/block/replication.c (revision f6ac2078)
129ff7890SWen Congyang /*
229ff7890SWen Congyang  * Replication Block filter
329ff7890SWen Congyang  *
429ff7890SWen Congyang  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
529ff7890SWen Congyang  * Copyright (c) 2016 Intel Corporation
629ff7890SWen Congyang  * Copyright (c) 2016 FUJITSU LIMITED
729ff7890SWen Congyang  *
829ff7890SWen Congyang  * Author:
929ff7890SWen Congyang  *   Wen Congyang <wency@cn.fujitsu.com>
1029ff7890SWen Congyang  *
1129ff7890SWen Congyang  * This work is licensed under the terms of the GNU GPL, version 2 or later.
1229ff7890SWen Congyang  * See the COPYING file in the top-level directory.
1329ff7890SWen Congyang  */
1429ff7890SWen Congyang 
1529ff7890SWen Congyang #include "qemu/osdep.h"
1629ff7890SWen Congyang #include "qemu-common.h"
1729ff7890SWen Congyang #include "block/nbd.h"
1829ff7890SWen Congyang #include "block/blockjob.h"
1929ff7890SWen Congyang #include "block/block_int.h"
2029ff7890SWen Congyang #include "block/block_backup.h"
2129ff7890SWen Congyang #include "sysemu/block-backend.h"
2229ff7890SWen Congyang #include "qapi/error.h"
2329ff7890SWen Congyang #include "replication.h"
2429ff7890SWen Congyang 
/* Lifecycle stages of a replication filter node. */
typedef enum {
    /* block replication is not started */
    BLOCK_REPLICATION_NONE,
    /* block replication is running */
    BLOCK_REPLICATION_RUNNING,
    /* failover is running in background */
    BLOCK_REPLICATION_FAILOVER,
    /* failover failed */
    BLOCK_REPLICATION_FAILOVER_FAILED,
    /* block replication is done */
    BLOCK_REPLICATION_DONE,
} ReplicationStage;
323c76c606SFam Zheng 
3329ff7890SWen Congyang typedef struct BDRVReplicationState {
3429ff7890SWen Congyang     ReplicationMode mode;
353c76c606SFam Zheng     ReplicationStage stage;
3629ff7890SWen Congyang     BdrvChild *active_disk;
3729ff7890SWen Congyang     BdrvChild *hidden_disk;
3829ff7890SWen Congyang     BdrvChild *secondary_disk;
3929ff7890SWen Congyang     char *top_id;
4029ff7890SWen Congyang     ReplicationState *rs;
4129ff7890SWen Congyang     Error *blocker;
4229ff7890SWen Congyang     int orig_hidden_flags;
4329ff7890SWen Congyang     int orig_secondary_flags;
4429ff7890SWen Congyang     int error;
4529ff7890SWen Congyang } BDRVReplicationState;
4629ff7890SWen Congyang 
4729ff7890SWen Congyang static void replication_start(ReplicationState *rs, ReplicationMode mode,
4829ff7890SWen Congyang                               Error **errp);
4929ff7890SWen Congyang static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
5029ff7890SWen Congyang static void replication_get_error(ReplicationState *rs, Error **errp);
5129ff7890SWen Congyang static void replication_stop(ReplicationState *rs, bool failover,
5229ff7890SWen Congyang                              Error **errp);
5329ff7890SWen Congyang 
5429ff7890SWen Congyang #define REPLICATION_MODE        "mode"
5529ff7890SWen Congyang #define REPLICATION_TOP_ID      "top-id"
5629ff7890SWen Congyang static QemuOptsList replication_runtime_opts = {
5729ff7890SWen Congyang     .name = "replication",
5829ff7890SWen Congyang     .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
5929ff7890SWen Congyang     .desc = {
6029ff7890SWen Congyang         {
6129ff7890SWen Congyang             .name = REPLICATION_MODE,
6229ff7890SWen Congyang             .type = QEMU_OPT_STRING,
6329ff7890SWen Congyang         },
6429ff7890SWen Congyang         {
6529ff7890SWen Congyang             .name = REPLICATION_TOP_ID,
6629ff7890SWen Congyang             .type = QEMU_OPT_STRING,
6729ff7890SWen Congyang         },
6829ff7890SWen Congyang         { /* end of list */ }
6929ff7890SWen Congyang     },
7029ff7890SWen Congyang };
7129ff7890SWen Congyang 
7229ff7890SWen Congyang static ReplicationOps replication_ops = {
7329ff7890SWen Congyang     .start = replication_start,
7429ff7890SWen Congyang     .checkpoint = replication_do_checkpoint,
7529ff7890SWen Congyang     .get_error = replication_get_error,
7629ff7890SWen Congyang     .stop = replication_stop,
7729ff7890SWen Congyang };
7829ff7890SWen Congyang 
7929ff7890SWen Congyang static int replication_open(BlockDriverState *bs, QDict *options,
8029ff7890SWen Congyang                             int flags, Error **errp)
8129ff7890SWen Congyang {
8229ff7890SWen Congyang     int ret;
8329ff7890SWen Congyang     BDRVReplicationState *s = bs->opaque;
8429ff7890SWen Congyang     Error *local_err = NULL;
8529ff7890SWen Congyang     QemuOpts *opts = NULL;
8629ff7890SWen Congyang     const char *mode;
8729ff7890SWen Congyang     const char *top_id;
8829ff7890SWen Congyang 
894e4bf5c4SKevin Wolf     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
904e4bf5c4SKevin Wolf                                false, errp);
914e4bf5c4SKevin Wolf     if (!bs->file) {
924e4bf5c4SKevin Wolf         return -EINVAL;
934e4bf5c4SKevin Wolf     }
944e4bf5c4SKevin Wolf 
9529ff7890SWen Congyang     ret = -EINVAL;
9629ff7890SWen Congyang     opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
9729ff7890SWen Congyang     qemu_opts_absorb_qdict(opts, options, &local_err);
9829ff7890SWen Congyang     if (local_err) {
9929ff7890SWen Congyang         goto fail;
10029ff7890SWen Congyang     }
10129ff7890SWen Congyang 
10229ff7890SWen Congyang     mode = qemu_opt_get(opts, REPLICATION_MODE);
10329ff7890SWen Congyang     if (!mode) {
10429ff7890SWen Congyang         error_setg(&local_err, "Missing the option mode");
10529ff7890SWen Congyang         goto fail;
10629ff7890SWen Congyang     }
10729ff7890SWen Congyang 
10829ff7890SWen Congyang     if (!strcmp(mode, "primary")) {
10929ff7890SWen Congyang         s->mode = REPLICATION_MODE_PRIMARY;
110f4f2539bSChanglong Xie         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
111f4f2539bSChanglong Xie         if (top_id) {
112f4f2539bSChanglong Xie             error_setg(&local_err, "The primary side does not support option top-id");
113f4f2539bSChanglong Xie             goto fail;
114f4f2539bSChanglong Xie         }
11529ff7890SWen Congyang     } else if (!strcmp(mode, "secondary")) {
11629ff7890SWen Congyang         s->mode = REPLICATION_MODE_SECONDARY;
11729ff7890SWen Congyang         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
11829ff7890SWen Congyang         s->top_id = g_strdup(top_id);
11929ff7890SWen Congyang         if (!s->top_id) {
12029ff7890SWen Congyang             error_setg(&local_err, "Missing the option top-id");
12129ff7890SWen Congyang             goto fail;
12229ff7890SWen Congyang         }
12329ff7890SWen Congyang     } else {
12429ff7890SWen Congyang         error_setg(&local_err,
12529ff7890SWen Congyang                    "The option mode's value should be primary or secondary");
12629ff7890SWen Congyang         goto fail;
12729ff7890SWen Congyang     }
12829ff7890SWen Congyang 
12929ff7890SWen Congyang     s->rs = replication_new(bs, &replication_ops);
13029ff7890SWen Congyang 
13129ff7890SWen Congyang     ret = 0;
13229ff7890SWen Congyang 
13329ff7890SWen Congyang fail:
13429ff7890SWen Congyang     qemu_opts_del(opts);
13529ff7890SWen Congyang     error_propagate(errp, local_err);
13629ff7890SWen Congyang 
13729ff7890SWen Congyang     return ret;
13829ff7890SWen Congyang }
13929ff7890SWen Congyang 
14029ff7890SWen Congyang static void replication_close(BlockDriverState *bs)
14129ff7890SWen Congyang {
14229ff7890SWen Congyang     BDRVReplicationState *s = bs->opaque;
14329ff7890SWen Congyang 
1443c76c606SFam Zheng     if (s->stage == BLOCK_REPLICATION_RUNNING) {
14529ff7890SWen Congyang         replication_stop(s->rs, false, NULL);
14629ff7890SWen Congyang     }
1473c76c606SFam Zheng     if (s->stage == BLOCK_REPLICATION_FAILOVER) {
14850ab0e09SPaolo Bonzini         block_job_cancel_sync(s->active_disk->bs->job);
14950ab0e09SPaolo Bonzini     }
15029ff7890SWen Congyang 
15129ff7890SWen Congyang     if (s->mode == REPLICATION_MODE_SECONDARY) {
15229ff7890SWen Congyang         g_free(s->top_id);
15329ff7890SWen Congyang     }
15429ff7890SWen Congyang 
15529ff7890SWen Congyang     replication_remove(s->rs);
15629ff7890SWen Congyang }
15729ff7890SWen Congyang 
15837a9051cSChanglong Xie static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
15937a9051cSChanglong Xie                                    const BdrvChildRole *role,
16037a9051cSChanglong Xie                                    uint64_t perm, uint64_t shared,
16137a9051cSChanglong Xie                                    uint64_t *nperm, uint64_t *nshared)
16237a9051cSChanglong Xie {
16337a9051cSChanglong Xie     *nperm = *nshared = BLK_PERM_CONSISTENT_READ \
16437a9051cSChanglong Xie                         | BLK_PERM_WRITE \
16537a9051cSChanglong Xie                         | BLK_PERM_WRITE_UNCHANGED;
16637a9051cSChanglong Xie 
16737a9051cSChanglong Xie     return;
16837a9051cSChanglong Xie }
16937a9051cSChanglong Xie 
17029ff7890SWen Congyang static int64_t replication_getlength(BlockDriverState *bs)
17129ff7890SWen Congyang {
17229ff7890SWen Congyang     return bdrv_getlength(bs->file->bs);
17329ff7890SWen Congyang }
17429ff7890SWen Congyang 
17529ff7890SWen Congyang static int replication_get_io_status(BDRVReplicationState *s)
17629ff7890SWen Congyang {
1773c76c606SFam Zheng     switch (s->stage) {
17829ff7890SWen Congyang     case BLOCK_REPLICATION_NONE:
17929ff7890SWen Congyang         return -EIO;
18029ff7890SWen Congyang     case BLOCK_REPLICATION_RUNNING:
18129ff7890SWen Congyang         return 0;
18229ff7890SWen Congyang     case BLOCK_REPLICATION_FAILOVER:
18329ff7890SWen Congyang         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
18429ff7890SWen Congyang     case BLOCK_REPLICATION_FAILOVER_FAILED:
18529ff7890SWen Congyang         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
18629ff7890SWen Congyang     case BLOCK_REPLICATION_DONE:
18729ff7890SWen Congyang         /*
18829ff7890SWen Congyang          * active commit job completes, and active disk and secondary_disk
18929ff7890SWen Congyang          * is swapped, so we can operate bs->file directly
19029ff7890SWen Congyang          */
19129ff7890SWen Congyang         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
19229ff7890SWen Congyang     default:
19329ff7890SWen Congyang         abort();
19429ff7890SWen Congyang     }
19529ff7890SWen Congyang }
19629ff7890SWen Congyang 
19729ff7890SWen Congyang static int replication_return_value(BDRVReplicationState *s, int ret)
19829ff7890SWen Congyang {
19929ff7890SWen Congyang     if (s->mode == REPLICATION_MODE_SECONDARY) {
20029ff7890SWen Congyang         return ret;
20129ff7890SWen Congyang     }
20229ff7890SWen Congyang 
20329ff7890SWen Congyang     if (ret < 0) {
20429ff7890SWen Congyang         s->error = ret;
20529ff7890SWen Congyang         ret = 0;
20629ff7890SWen Congyang     }
20729ff7890SWen Congyang 
20829ff7890SWen Congyang     return ret;
20929ff7890SWen Congyang }
21029ff7890SWen Congyang 
21129ff7890SWen Congyang static coroutine_fn int replication_co_readv(BlockDriverState *bs,
21229ff7890SWen Congyang                                              int64_t sector_num,
21329ff7890SWen Congyang                                              int remaining_sectors,
21429ff7890SWen Congyang                                              QEMUIOVector *qiov)
21529ff7890SWen Congyang {
21629ff7890SWen Congyang     BDRVReplicationState *s = bs->opaque;
21729ff7890SWen Congyang     BdrvChild *child = s->secondary_disk;
21829ff7890SWen Congyang     BlockJob *job = NULL;
21929ff7890SWen Congyang     CowRequest req;
22029ff7890SWen Congyang     int ret;
22129ff7890SWen Congyang 
22229ff7890SWen Congyang     if (s->mode == REPLICATION_MODE_PRIMARY) {
22329ff7890SWen Congyang         /* We only use it to forward primary write requests */
22429ff7890SWen Congyang         return -EIO;
22529ff7890SWen Congyang     }
22629ff7890SWen Congyang 
22729ff7890SWen Congyang     ret = replication_get_io_status(s);
22829ff7890SWen Congyang     if (ret < 0) {
22929ff7890SWen Congyang         return ret;
23029ff7890SWen Congyang     }
23129ff7890SWen Congyang 
23229ff7890SWen Congyang     if (child && child->bs) {
23329ff7890SWen Congyang         job = child->bs->job;
23429ff7890SWen Congyang     }
23529ff7890SWen Congyang 
23629ff7890SWen Congyang     if (job) {
237*f6ac2078SEric Blake         uint64_t remaining_bytes = remaining_sectors * BDRV_SECTOR_SIZE;
238*f6ac2078SEric Blake 
239*f6ac2078SEric Blake         backup_wait_for_overlapping_requests(child->bs->job,
240*f6ac2078SEric Blake                                              sector_num * BDRV_SECTOR_SIZE,
241*f6ac2078SEric Blake                                              remaining_bytes);
242*f6ac2078SEric Blake         backup_cow_request_begin(&req, child->bs->job,
243*f6ac2078SEric Blake                                  sector_num * BDRV_SECTOR_SIZE,
244*f6ac2078SEric Blake                                  remaining_bytes);
24529ff7890SWen Congyang         ret = bdrv_co_readv(bs->file, sector_num, remaining_sectors,
24629ff7890SWen Congyang                             qiov);
24729ff7890SWen Congyang         backup_cow_request_end(&req);
24829ff7890SWen Congyang         goto out;
24929ff7890SWen Congyang     }
25029ff7890SWen Congyang 
25129ff7890SWen Congyang     ret = bdrv_co_readv(bs->file, sector_num, remaining_sectors, qiov);
25229ff7890SWen Congyang out:
25329ff7890SWen Congyang     return replication_return_value(s, ret);
25429ff7890SWen Congyang }
25529ff7890SWen Congyang 
25629ff7890SWen Congyang static coroutine_fn int replication_co_writev(BlockDriverState *bs,
25729ff7890SWen Congyang                                               int64_t sector_num,
25829ff7890SWen Congyang                                               int remaining_sectors,
25929ff7890SWen Congyang                                               QEMUIOVector *qiov)
26029ff7890SWen Congyang {
26129ff7890SWen Congyang     BDRVReplicationState *s = bs->opaque;
26229ff7890SWen Congyang     QEMUIOVector hd_qiov;
26329ff7890SWen Congyang     uint64_t bytes_done = 0;
26429ff7890SWen Congyang     BdrvChild *top = bs->file;
26529ff7890SWen Congyang     BdrvChild *base = s->secondary_disk;
26629ff7890SWen Congyang     BdrvChild *target;
26729ff7890SWen Congyang     int ret, n;
26829ff7890SWen Congyang 
26929ff7890SWen Congyang     ret = replication_get_io_status(s);
27029ff7890SWen Congyang     if (ret < 0) {
27129ff7890SWen Congyang         goto out;
27229ff7890SWen Congyang     }
27329ff7890SWen Congyang 
27429ff7890SWen Congyang     if (ret == 0) {
27529ff7890SWen Congyang         ret = bdrv_co_writev(top, sector_num,
27629ff7890SWen Congyang                              remaining_sectors, qiov);
27729ff7890SWen Congyang         return replication_return_value(s, ret);
27829ff7890SWen Congyang     }
27929ff7890SWen Congyang 
28029ff7890SWen Congyang     /*
28129ff7890SWen Congyang      * Failover failed, only write to active disk if the sectors
28229ff7890SWen Congyang      * have already been allocated in active disk/hidden disk.
28329ff7890SWen Congyang      */
28429ff7890SWen Congyang     qemu_iovec_init(&hd_qiov, qiov->niov);
28529ff7890SWen Congyang     while (remaining_sectors > 0) {
28629ff7890SWen Congyang         ret = bdrv_is_allocated_above(top->bs, base->bs, sector_num,
28729ff7890SWen Congyang                                       remaining_sectors, &n);
28829ff7890SWen Congyang         if (ret < 0) {
28929ff7890SWen Congyang             goto out1;
29029ff7890SWen Congyang         }
29129ff7890SWen Congyang 
29229ff7890SWen Congyang         qemu_iovec_reset(&hd_qiov);
29329ff7890SWen Congyang         qemu_iovec_concat(&hd_qiov, qiov, bytes_done, n * BDRV_SECTOR_SIZE);
29429ff7890SWen Congyang 
29529ff7890SWen Congyang         target = ret ? top : base;
29629ff7890SWen Congyang         ret = bdrv_co_writev(target, sector_num, n, &hd_qiov);
29729ff7890SWen Congyang         if (ret < 0) {
29829ff7890SWen Congyang             goto out1;
29929ff7890SWen Congyang         }
30029ff7890SWen Congyang 
30129ff7890SWen Congyang         remaining_sectors -= n;
30229ff7890SWen Congyang         sector_num += n;
30329ff7890SWen Congyang         bytes_done += n * BDRV_SECTOR_SIZE;
30429ff7890SWen Congyang     }
30529ff7890SWen Congyang 
30629ff7890SWen Congyang out1:
30729ff7890SWen Congyang     qemu_iovec_destroy(&hd_qiov);
30829ff7890SWen Congyang out:
30929ff7890SWen Congyang     return ret;
31029ff7890SWen Congyang }
31129ff7890SWen Congyang 
31229ff7890SWen Congyang static bool replication_recurse_is_first_non_filter(BlockDriverState *bs,
31329ff7890SWen Congyang                                                     BlockDriverState *candidate)
31429ff7890SWen Congyang {
31529ff7890SWen Congyang     return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
31629ff7890SWen Congyang }
31729ff7890SWen Congyang 
31829ff7890SWen Congyang static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
31929ff7890SWen Congyang {
32029ff7890SWen Congyang     Error *local_err = NULL;
32129ff7890SWen Congyang     int ret;
32229ff7890SWen Congyang 
32329ff7890SWen Congyang     if (!s->secondary_disk->bs->job) {
32429ff7890SWen Congyang         error_setg(errp, "Backup job was cancelled unexpectedly");
32529ff7890SWen Congyang         return;
32629ff7890SWen Congyang     }
32729ff7890SWen Congyang 
32829ff7890SWen Congyang     backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
32929ff7890SWen Congyang     if (local_err) {
33029ff7890SWen Congyang         error_propagate(errp, local_err);
33129ff7890SWen Congyang         return;
33229ff7890SWen Congyang     }
33329ff7890SWen Congyang 
33429ff7890SWen Congyang     ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
33529ff7890SWen Congyang     if (ret < 0) {
33629ff7890SWen Congyang         error_setg(errp, "Cannot make active disk empty");
33729ff7890SWen Congyang         return;
33829ff7890SWen Congyang     }
33929ff7890SWen Congyang 
34029ff7890SWen Congyang     ret = s->hidden_disk->bs->drv->bdrv_make_empty(s->hidden_disk->bs);
34129ff7890SWen Congyang     if (ret < 0) {
34229ff7890SWen Congyang         error_setg(errp, "Cannot make hidden disk empty");
34329ff7890SWen Congyang         return;
34429ff7890SWen Congyang     }
34529ff7890SWen Congyang }
34629ff7890SWen Congyang 
3478dd9006eSPaolo Bonzini static void reopen_backing_file(BlockDriverState *bs, bool writable,
34829ff7890SWen Congyang                                 Error **errp)
34929ff7890SWen Congyang {
3508dd9006eSPaolo Bonzini     BDRVReplicationState *s = bs->opaque;
35129ff7890SWen Congyang     BlockReopenQueue *reopen_queue = NULL;
35229ff7890SWen Congyang     int orig_hidden_flags, orig_secondary_flags;
35329ff7890SWen Congyang     int new_hidden_flags, new_secondary_flags;
35429ff7890SWen Congyang     Error *local_err = NULL;
35529ff7890SWen Congyang 
35629ff7890SWen Congyang     if (writable) {
35729ff7890SWen Congyang         orig_hidden_flags = s->orig_hidden_flags =
35829ff7890SWen Congyang                                 bdrv_get_flags(s->hidden_disk->bs);
35929ff7890SWen Congyang         new_hidden_flags = (orig_hidden_flags | BDRV_O_RDWR) &
36029ff7890SWen Congyang                                                     ~BDRV_O_INACTIVE;
36129ff7890SWen Congyang         orig_secondary_flags = s->orig_secondary_flags =
36229ff7890SWen Congyang                                 bdrv_get_flags(s->secondary_disk->bs);
36329ff7890SWen Congyang         new_secondary_flags = (orig_secondary_flags | BDRV_O_RDWR) &
36429ff7890SWen Congyang                                                      ~BDRV_O_INACTIVE;
36529ff7890SWen Congyang     } else {
36629ff7890SWen Congyang         orig_hidden_flags = (s->orig_hidden_flags | BDRV_O_RDWR) &
36729ff7890SWen Congyang                                                     ~BDRV_O_INACTIVE;
36829ff7890SWen Congyang         new_hidden_flags = s->orig_hidden_flags;
36929ff7890SWen Congyang         orig_secondary_flags = (s->orig_secondary_flags | BDRV_O_RDWR) &
37029ff7890SWen Congyang                                                     ~BDRV_O_INACTIVE;
37129ff7890SWen Congyang         new_secondary_flags = s->orig_secondary_flags;
37229ff7890SWen Congyang     }
37329ff7890SWen Congyang 
37429ff7890SWen Congyang     if (orig_hidden_flags != new_hidden_flags) {
37529ff7890SWen Congyang         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
37629ff7890SWen Congyang                                          new_hidden_flags);
37729ff7890SWen Congyang     }
37829ff7890SWen Congyang 
37929ff7890SWen Congyang     if (!(orig_secondary_flags & BDRV_O_RDWR)) {
38029ff7890SWen Congyang         reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
38129ff7890SWen Congyang                                          NULL, new_secondary_flags);
38229ff7890SWen Congyang     }
38329ff7890SWen Congyang 
38429ff7890SWen Congyang     if (reopen_queue) {
385720150f3SPaolo Bonzini         bdrv_reopen_multiple(bdrv_get_aio_context(bs),
386720150f3SPaolo Bonzini                              reopen_queue, &local_err);
38729ff7890SWen Congyang         error_propagate(errp, local_err);
38829ff7890SWen Congyang     }
38929ff7890SWen Congyang }
39029ff7890SWen Congyang 
3918dd9006eSPaolo Bonzini static void backup_job_cleanup(BlockDriverState *bs)
39229ff7890SWen Congyang {
3938dd9006eSPaolo Bonzini     BDRVReplicationState *s = bs->opaque;
39429ff7890SWen Congyang     BlockDriverState *top_bs;
39529ff7890SWen Congyang 
39629ff7890SWen Congyang     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
39729ff7890SWen Congyang     if (!top_bs) {
39829ff7890SWen Congyang         return;
39929ff7890SWen Congyang     }
40029ff7890SWen Congyang     bdrv_op_unblock_all(top_bs, s->blocker);
40129ff7890SWen Congyang     error_free(s->blocker);
4028dd9006eSPaolo Bonzini     reopen_backing_file(bs, false, NULL);
40329ff7890SWen Congyang }
40429ff7890SWen Congyang 
40529ff7890SWen Congyang static void backup_job_completed(void *opaque, int ret)
40629ff7890SWen Congyang {
4078dd9006eSPaolo Bonzini     BlockDriverState *bs = opaque;
4088dd9006eSPaolo Bonzini     BDRVReplicationState *s = bs->opaque;
40929ff7890SWen Congyang 
4103c76c606SFam Zheng     if (s->stage != BLOCK_REPLICATION_FAILOVER) {
41129ff7890SWen Congyang         /* The backup job is cancelled unexpectedly */
41229ff7890SWen Congyang         s->error = -EIO;
41329ff7890SWen Congyang     }
41429ff7890SWen Congyang 
4158dd9006eSPaolo Bonzini     backup_job_cleanup(bs);
41629ff7890SWen Congyang }
41729ff7890SWen Congyang 
41829ff7890SWen Congyang static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
41929ff7890SWen Congyang {
42029ff7890SWen Congyang     BdrvChild *child;
42129ff7890SWen Congyang 
42229ff7890SWen Congyang     /* The bs itself is the top_bs */
42329ff7890SWen Congyang     if (top_bs == bs) {
42429ff7890SWen Congyang         return true;
42529ff7890SWen Congyang     }
42629ff7890SWen Congyang 
42729ff7890SWen Congyang     /* Iterate over top_bs's children */
42829ff7890SWen Congyang     QLIST_FOREACH(child, &top_bs->children, next) {
42929ff7890SWen Congyang         if (child->bs == bs || check_top_bs(child->bs, bs)) {
43029ff7890SWen Congyang             return true;
43129ff7890SWen Congyang         }
43229ff7890SWen Congyang     }
43329ff7890SWen Congyang 
43429ff7890SWen Congyang     return false;
43529ff7890SWen Congyang }
43629ff7890SWen Congyang 
43729ff7890SWen Congyang static void replication_start(ReplicationState *rs, ReplicationMode mode,
43829ff7890SWen Congyang                               Error **errp)
43929ff7890SWen Congyang {
44029ff7890SWen Congyang     BlockDriverState *bs = rs->opaque;
44129ff7890SWen Congyang     BDRVReplicationState *s;
44229ff7890SWen Congyang     BlockDriverState *top_bs;
44329ff7890SWen Congyang     int64_t active_length, hidden_length, disk_length;
44429ff7890SWen Congyang     AioContext *aio_context;
44529ff7890SWen Congyang     Error *local_err = NULL;
446111049a4SJohn Snow     BlockJob *job;
44729ff7890SWen Congyang 
44829ff7890SWen Congyang     aio_context = bdrv_get_aio_context(bs);
44929ff7890SWen Congyang     aio_context_acquire(aio_context);
45029ff7890SWen Congyang     s = bs->opaque;
45129ff7890SWen Congyang 
4523c76c606SFam Zheng     if (s->stage != BLOCK_REPLICATION_NONE) {
45329ff7890SWen Congyang         error_setg(errp, "Block replication is running or done");
45429ff7890SWen Congyang         aio_context_release(aio_context);
45529ff7890SWen Congyang         return;
45629ff7890SWen Congyang     }
45729ff7890SWen Congyang 
45829ff7890SWen Congyang     if (s->mode != mode) {
45929ff7890SWen Congyang         error_setg(errp, "The parameter mode's value is invalid, needs %d,"
46029ff7890SWen Congyang                    " but got %d", s->mode, mode);
46129ff7890SWen Congyang         aio_context_release(aio_context);
46229ff7890SWen Congyang         return;
46329ff7890SWen Congyang     }
46429ff7890SWen Congyang 
46529ff7890SWen Congyang     switch (s->mode) {
46629ff7890SWen Congyang     case REPLICATION_MODE_PRIMARY:
46729ff7890SWen Congyang         break;
46829ff7890SWen Congyang     case REPLICATION_MODE_SECONDARY:
46929ff7890SWen Congyang         s->active_disk = bs->file;
47029ff7890SWen Congyang         if (!s->active_disk || !s->active_disk->bs ||
47129ff7890SWen Congyang                                     !s->active_disk->bs->backing) {
47229ff7890SWen Congyang             error_setg(errp, "Active disk doesn't have backing file");
47329ff7890SWen Congyang             aio_context_release(aio_context);
47429ff7890SWen Congyang             return;
47529ff7890SWen Congyang         }
47629ff7890SWen Congyang 
47729ff7890SWen Congyang         s->hidden_disk = s->active_disk->bs->backing;
47829ff7890SWen Congyang         if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
47929ff7890SWen Congyang             error_setg(errp, "Hidden disk doesn't have backing file");
48029ff7890SWen Congyang             aio_context_release(aio_context);
48129ff7890SWen Congyang             return;
48229ff7890SWen Congyang         }
48329ff7890SWen Congyang 
48429ff7890SWen Congyang         s->secondary_disk = s->hidden_disk->bs->backing;
48529ff7890SWen Congyang         if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
48629ff7890SWen Congyang             error_setg(errp, "The secondary disk doesn't have block backend");
48729ff7890SWen Congyang             aio_context_release(aio_context);
48829ff7890SWen Congyang             return;
48929ff7890SWen Congyang         }
49029ff7890SWen Congyang 
49129ff7890SWen Congyang         /* verify the length */
49229ff7890SWen Congyang         active_length = bdrv_getlength(s->active_disk->bs);
49329ff7890SWen Congyang         hidden_length = bdrv_getlength(s->hidden_disk->bs);
49429ff7890SWen Congyang         disk_length = bdrv_getlength(s->secondary_disk->bs);
49529ff7890SWen Congyang         if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
49629ff7890SWen Congyang             active_length != hidden_length || hidden_length != disk_length) {
49729ff7890SWen Congyang             error_setg(errp, "Active disk, hidden disk, secondary disk's length"
49829ff7890SWen Congyang                        " are not the same");
49929ff7890SWen Congyang             aio_context_release(aio_context);
50029ff7890SWen Congyang             return;
50129ff7890SWen Congyang         }
50229ff7890SWen Congyang 
50329ff7890SWen Congyang         if (!s->active_disk->bs->drv->bdrv_make_empty ||
50429ff7890SWen Congyang             !s->hidden_disk->bs->drv->bdrv_make_empty) {
50529ff7890SWen Congyang             error_setg(errp,
50629ff7890SWen Congyang                        "Active disk or hidden disk doesn't support make_empty");
50729ff7890SWen Congyang             aio_context_release(aio_context);
50829ff7890SWen Congyang             return;
50929ff7890SWen Congyang         }
51029ff7890SWen Congyang 
51129ff7890SWen Congyang         /* reopen the backing file in r/w mode */
5128dd9006eSPaolo Bonzini         reopen_backing_file(bs, true, &local_err);
51329ff7890SWen Congyang         if (local_err) {
51429ff7890SWen Congyang             error_propagate(errp, local_err);
51529ff7890SWen Congyang             aio_context_release(aio_context);
51629ff7890SWen Congyang             return;
51729ff7890SWen Congyang         }
51829ff7890SWen Congyang 
51929ff7890SWen Congyang         /* start backup job now */
52029ff7890SWen Congyang         error_setg(&s->blocker,
52129ff7890SWen Congyang                    "Block device is in use by internal backup job");
52229ff7890SWen Congyang 
52329ff7890SWen Congyang         top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
52429ff7890SWen Congyang         if (!top_bs || !bdrv_is_root_node(top_bs) ||
52529ff7890SWen Congyang             !check_top_bs(top_bs, bs)) {
52629ff7890SWen Congyang             error_setg(errp, "No top_bs or it is invalid");
5278dd9006eSPaolo Bonzini             reopen_backing_file(bs, false, NULL);
52829ff7890SWen Congyang             aio_context_release(aio_context);
52929ff7890SWen Congyang             return;
53029ff7890SWen Congyang         }
53129ff7890SWen Congyang         bdrv_op_block_all(top_bs, s->blocker);
53229ff7890SWen Congyang         bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
53329ff7890SWen Congyang 
534111049a4SJohn Snow         job = backup_job_create(NULL, s->secondary_disk->bs, s->hidden_disk->bs,
535111049a4SJohn Snow                                 0, MIRROR_SYNC_MODE_NONE, NULL, false,
536111049a4SJohn Snow                                 BLOCKDEV_ON_ERROR_REPORT,
537111049a4SJohn Snow                                 BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
538111049a4SJohn Snow                                 backup_job_completed, bs, NULL, &local_err);
53929ff7890SWen Congyang         if (local_err) {
54029ff7890SWen Congyang             error_propagate(errp, local_err);
5418dd9006eSPaolo Bonzini             backup_job_cleanup(bs);
54229ff7890SWen Congyang             aio_context_release(aio_context);
54329ff7890SWen Congyang             return;
54429ff7890SWen Congyang         }
545111049a4SJohn Snow         block_job_start(job);
54629ff7890SWen Congyang         break;
54729ff7890SWen Congyang     default:
54829ff7890SWen Congyang         aio_context_release(aio_context);
54929ff7890SWen Congyang         abort();
55029ff7890SWen Congyang     }
55129ff7890SWen Congyang 
5523c76c606SFam Zheng     s->stage = BLOCK_REPLICATION_RUNNING;
55329ff7890SWen Congyang 
55429ff7890SWen Congyang     if (s->mode == REPLICATION_MODE_SECONDARY) {
55529ff7890SWen Congyang         secondary_do_checkpoint(s, errp);
55629ff7890SWen Congyang     }
55729ff7890SWen Congyang 
55829ff7890SWen Congyang     s->error = 0;
55929ff7890SWen Congyang     aio_context_release(aio_context);
56029ff7890SWen Congyang }
56129ff7890SWen Congyang 
56229ff7890SWen Congyang static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
56329ff7890SWen Congyang {
56429ff7890SWen Congyang     BlockDriverState *bs = rs->opaque;
56529ff7890SWen Congyang     BDRVReplicationState *s;
56629ff7890SWen Congyang     AioContext *aio_context;
56729ff7890SWen Congyang 
56829ff7890SWen Congyang     aio_context = bdrv_get_aio_context(bs);
56929ff7890SWen Congyang     aio_context_acquire(aio_context);
57029ff7890SWen Congyang     s = bs->opaque;
57129ff7890SWen Congyang 
57229ff7890SWen Congyang     if (s->mode == REPLICATION_MODE_SECONDARY) {
57329ff7890SWen Congyang         secondary_do_checkpoint(s, errp);
57429ff7890SWen Congyang     }
57529ff7890SWen Congyang     aio_context_release(aio_context);
57629ff7890SWen Congyang }
57729ff7890SWen Congyang 
57829ff7890SWen Congyang static void replication_get_error(ReplicationState *rs, Error **errp)
57929ff7890SWen Congyang {
58029ff7890SWen Congyang     BlockDriverState *bs = rs->opaque;
58129ff7890SWen Congyang     BDRVReplicationState *s;
58229ff7890SWen Congyang     AioContext *aio_context;
58329ff7890SWen Congyang 
58429ff7890SWen Congyang     aio_context = bdrv_get_aio_context(bs);
58529ff7890SWen Congyang     aio_context_acquire(aio_context);
58629ff7890SWen Congyang     s = bs->opaque;
58729ff7890SWen Congyang 
5883c76c606SFam Zheng     if (s->stage != BLOCK_REPLICATION_RUNNING) {
58929ff7890SWen Congyang         error_setg(errp, "Block replication is not running");
59029ff7890SWen Congyang         aio_context_release(aio_context);
59129ff7890SWen Congyang         return;
59229ff7890SWen Congyang     }
59329ff7890SWen Congyang 
59429ff7890SWen Congyang     if (s->error) {
59529ff7890SWen Congyang         error_setg(errp, "I/O error occurred");
59629ff7890SWen Congyang         aio_context_release(aio_context);
59729ff7890SWen Congyang         return;
59829ff7890SWen Congyang     }
59929ff7890SWen Congyang     aio_context_release(aio_context);
60029ff7890SWen Congyang }
60129ff7890SWen Congyang 
60229ff7890SWen Congyang static void replication_done(void *opaque, int ret)
60329ff7890SWen Congyang {
60429ff7890SWen Congyang     BlockDriverState *bs = opaque;
60529ff7890SWen Congyang     BDRVReplicationState *s = bs->opaque;
60629ff7890SWen Congyang 
60729ff7890SWen Congyang     if (ret == 0) {
6083c76c606SFam Zheng         s->stage = BLOCK_REPLICATION_DONE;
60929ff7890SWen Congyang 
61029ff7890SWen Congyang         /* refresh top bs's filename */
61129ff7890SWen Congyang         bdrv_refresh_filename(bs);
61229ff7890SWen Congyang         s->active_disk = NULL;
61329ff7890SWen Congyang         s->secondary_disk = NULL;
61429ff7890SWen Congyang         s->hidden_disk = NULL;
61529ff7890SWen Congyang         s->error = 0;
61629ff7890SWen Congyang     } else {
6173c76c606SFam Zheng         s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
61829ff7890SWen Congyang         s->error = -EIO;
61929ff7890SWen Congyang     }
62029ff7890SWen Congyang }
62129ff7890SWen Congyang 
62229ff7890SWen Congyang static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
62329ff7890SWen Congyang {
62429ff7890SWen Congyang     BlockDriverState *bs = rs->opaque;
62529ff7890SWen Congyang     BDRVReplicationState *s;
62629ff7890SWen Congyang     AioContext *aio_context;
62729ff7890SWen Congyang 
62829ff7890SWen Congyang     aio_context = bdrv_get_aio_context(bs);
62929ff7890SWen Congyang     aio_context_acquire(aio_context);
63029ff7890SWen Congyang     s = bs->opaque;
63129ff7890SWen Congyang 
6323c76c606SFam Zheng     if (s->stage != BLOCK_REPLICATION_RUNNING) {
63329ff7890SWen Congyang         error_setg(errp, "Block replication is not running");
63429ff7890SWen Congyang         aio_context_release(aio_context);
63529ff7890SWen Congyang         return;
63629ff7890SWen Congyang     }
63729ff7890SWen Congyang 
63829ff7890SWen Congyang     switch (s->mode) {
63929ff7890SWen Congyang     case REPLICATION_MODE_PRIMARY:
6403c76c606SFam Zheng         s->stage = BLOCK_REPLICATION_DONE;
64129ff7890SWen Congyang         s->error = 0;
64229ff7890SWen Congyang         break;
64329ff7890SWen Congyang     case REPLICATION_MODE_SECONDARY:
64429ff7890SWen Congyang         /*
64529ff7890SWen Congyang          * This BDS will be closed, and the job should be completed
64629ff7890SWen Congyang          * before the BDS is closed, because we will access hidden
64729ff7890SWen Congyang          * disk, secondary disk in backup_job_completed().
64829ff7890SWen Congyang          */
64929ff7890SWen Congyang         if (s->secondary_disk->bs->job) {
65029ff7890SWen Congyang             block_job_cancel_sync(s->secondary_disk->bs->job);
65129ff7890SWen Congyang         }
65229ff7890SWen Congyang 
65329ff7890SWen Congyang         if (!failover) {
65429ff7890SWen Congyang             secondary_do_checkpoint(s, errp);
6553c76c606SFam Zheng             s->stage = BLOCK_REPLICATION_DONE;
65629ff7890SWen Congyang             aio_context_release(aio_context);
65729ff7890SWen Congyang             return;
65829ff7890SWen Congyang         }
65929ff7890SWen Congyang 
6603c76c606SFam Zheng         s->stage = BLOCK_REPLICATION_FAILOVER;
66147970dfbSJohn Snow         commit_active_start(NULL, s->active_disk->bs, s->secondary_disk->bs,
66247970dfbSJohn Snow                             BLOCK_JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
66378bbd910SFam Zheng                             NULL, replication_done, bs, true, errp);
66429ff7890SWen Congyang         break;
66529ff7890SWen Congyang     default:
66629ff7890SWen Congyang         aio_context_release(aio_context);
66729ff7890SWen Congyang         abort();
66829ff7890SWen Congyang     }
66929ff7890SWen Congyang     aio_context_release(aio_context);
67029ff7890SWen Congyang }
67129ff7890SWen Congyang 
67229ff7890SWen Congyang BlockDriver bdrv_replication = {
67329ff7890SWen Congyang     .format_name                = "replication",
67429ff7890SWen Congyang     .protocol_name              = "replication",
67529ff7890SWen Congyang     .instance_size              = sizeof(BDRVReplicationState),
67629ff7890SWen Congyang 
67729ff7890SWen Congyang     .bdrv_open                  = replication_open,
67829ff7890SWen Congyang     .bdrv_close                 = replication_close,
67937a9051cSChanglong Xie     .bdrv_child_perm            = replication_child_perm,
68029ff7890SWen Congyang 
68129ff7890SWen Congyang     .bdrv_getlength             = replication_getlength,
68229ff7890SWen Congyang     .bdrv_co_readv              = replication_co_readv,
68329ff7890SWen Congyang     .bdrv_co_writev             = replication_co_writev,
68429ff7890SWen Congyang 
68529ff7890SWen Congyang     .is_filter                  = true,
68629ff7890SWen Congyang     .bdrv_recurse_is_first_non_filter = replication_recurse_is_first_non_filter,
68729ff7890SWen Congyang 
68829ff7890SWen Congyang     .has_variable_length        = true,
68929ff7890SWen Congyang };
69029ff7890SWen Congyang 
69129ff7890SWen Congyang static void bdrv_replication_init(void)
69229ff7890SWen Congyang {
69329ff7890SWen Congyang     bdrv_register(&bdrv_replication);
69429ff7890SWen Congyang }
69529ff7890SWen Congyang 
69629ff7890SWen Congyang block_init(bdrv_replication_init);
697