/*
 * Replication Block filter
 *
 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
 * Copyright (c) 2016 Intel Corporation
 * Copyright (c) 2016 FUJITSU LIMITED
 *
 * Author:
 *   Wen Congyang <wency@cn.fujitsu.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
1429ff7890SWen Congyang
1529ff7890SWen Congyang #include "qemu/osdep.h"
160b8fa32fSMarkus Armbruster #include "qemu/module.h"
17922a01a0SMarkus Armbruster #include "qemu/option.h"
1829ff7890SWen Congyang #include "block/nbd.h"
1929ff7890SWen Congyang #include "block/blockjob.h"
2029ff7890SWen Congyang #include "block/block_int.h"
2129ff7890SWen Congyang #include "block/block_backup.h"
2229ff7890SWen Congyang #include "sysemu/block-backend.h"
2329ff7890SWen Congyang #include "qapi/error.h"
243c4e9647SAlberto Garcia #include "qapi/qmp/qdict.h"
25b0262955SPaolo Bonzini #include "block/replication.h"
2629ff7890SWen Congyang
/* Stage of the replication state machine tracked per filter node. */
typedef enum {
    /* block replication is not started */
    BLOCK_REPLICATION_NONE,
    /* block replication is running */
    BLOCK_REPLICATION_RUNNING,
    /* failover is running in background */
    BLOCK_REPLICATION_FAILOVER,
    /* failover failed */
    BLOCK_REPLICATION_FAILOVER_FAILED,
    /* block replication is done */
    BLOCK_REPLICATION_DONE,
} ReplicationStage;
343c76c606SFam Zheng
3529ff7890SWen Congyang typedef struct BDRVReplicationState {
3629ff7890SWen Congyang ReplicationMode mode;
373c76c606SFam Zheng ReplicationStage stage;
38cc19f177SVladimir Sementsov-Ogievskiy BlockJob *commit_job;
3929ff7890SWen Congyang BdrvChild *hidden_disk;
4029ff7890SWen Congyang BdrvChild *secondary_disk;
41cc19f177SVladimir Sementsov-Ogievskiy BlockJob *backup_job;
4229ff7890SWen Congyang char *top_id;
4329ff7890SWen Congyang ReplicationState *rs;
4429ff7890SWen Congyang Error *blocker;
453c4e9647SAlberto Garcia bool orig_hidden_read_only;
463c4e9647SAlberto Garcia bool orig_secondary_read_only;
4729ff7890SWen Congyang int error;
4829ff7890SWen Congyang } BDRVReplicationState;
4929ff7890SWen Congyang
5029ff7890SWen Congyang static void replication_start(ReplicationState *rs, ReplicationMode mode,
5129ff7890SWen Congyang Error **errp);
5229ff7890SWen Congyang static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
5329ff7890SWen Congyang static void replication_get_error(ReplicationState *rs, Error **errp);
5429ff7890SWen Congyang static void replication_stop(ReplicationState *rs, bool failover,
5529ff7890SWen Congyang Error **errp);
5629ff7890SWen Congyang
5729ff7890SWen Congyang #define REPLICATION_MODE "mode"
5829ff7890SWen Congyang #define REPLICATION_TOP_ID "top-id"
5929ff7890SWen Congyang static QemuOptsList replication_runtime_opts = {
6029ff7890SWen Congyang .name = "replication",
6129ff7890SWen Congyang .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
6229ff7890SWen Congyang .desc = {
6329ff7890SWen Congyang {
6429ff7890SWen Congyang .name = REPLICATION_MODE,
6529ff7890SWen Congyang .type = QEMU_OPT_STRING,
6629ff7890SWen Congyang },
6729ff7890SWen Congyang {
6829ff7890SWen Congyang .name = REPLICATION_TOP_ID,
6929ff7890SWen Congyang .type = QEMU_OPT_STRING,
7029ff7890SWen Congyang },
7129ff7890SWen Congyang { /* end of list */ }
7229ff7890SWen Congyang },
7329ff7890SWen Congyang };
7429ff7890SWen Congyang
7529ff7890SWen Congyang static ReplicationOps replication_ops = {
7629ff7890SWen Congyang .start = replication_start,
7729ff7890SWen Congyang .checkpoint = replication_do_checkpoint,
7829ff7890SWen Congyang .get_error = replication_get_error,
7929ff7890SWen Congyang .stop = replication_stop,
8029ff7890SWen Congyang };
8129ff7890SWen Congyang
replication_open(BlockDriverState * bs,QDict * options,int flags,Error ** errp)8229ff7890SWen Congyang static int replication_open(BlockDriverState *bs, QDict *options,
8329ff7890SWen Congyang int flags, Error **errp)
8429ff7890SWen Congyang {
8529ff7890SWen Congyang int ret;
8629ff7890SWen Congyang BDRVReplicationState *s = bs->opaque;
8729ff7890SWen Congyang QemuOpts *opts = NULL;
8829ff7890SWen Congyang const char *mode;
8929ff7890SWen Congyang const char *top_id;
9029ff7890SWen Congyang
9183930780SVladimir Sementsov-Ogievskiy ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
9283930780SVladimir Sementsov-Ogievskiy if (ret < 0) {
9383930780SVladimir Sementsov-Ogievskiy return ret;
944e4bf5c4SKevin Wolf }
954e4bf5c4SKevin Wolf
9629ff7890SWen Congyang ret = -EINVAL;
9729ff7890SWen Congyang opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
98a5f9b9dfSMarkus Armbruster if (!qemu_opts_absorb_qdict(opts, options, errp)) {
9929ff7890SWen Congyang goto fail;
10029ff7890SWen Congyang }
10129ff7890SWen Congyang
10229ff7890SWen Congyang mode = qemu_opt_get(opts, REPLICATION_MODE);
10329ff7890SWen Congyang if (!mode) {
104dcfe4805SMarkus Armbruster error_setg(errp, "Missing the option mode");
10529ff7890SWen Congyang goto fail;
10629ff7890SWen Congyang }
10729ff7890SWen Congyang
10829ff7890SWen Congyang if (!strcmp(mode, "primary")) {
10929ff7890SWen Congyang s->mode = REPLICATION_MODE_PRIMARY;
110f4f2539bSChanglong Xie top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
111f4f2539bSChanglong Xie if (top_id) {
112dcfe4805SMarkus Armbruster error_setg(errp,
113dcfe4805SMarkus Armbruster "The primary side does not support option top-id");
114f4f2539bSChanglong Xie goto fail;
115f4f2539bSChanglong Xie }
11629ff7890SWen Congyang } else if (!strcmp(mode, "secondary")) {
11729ff7890SWen Congyang s->mode = REPLICATION_MODE_SECONDARY;
11829ff7890SWen Congyang top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
11929ff7890SWen Congyang s->top_id = g_strdup(top_id);
12029ff7890SWen Congyang if (!s->top_id) {
121dcfe4805SMarkus Armbruster error_setg(errp, "Missing the option top-id");
12229ff7890SWen Congyang goto fail;
12329ff7890SWen Congyang }
12429ff7890SWen Congyang } else {
125dcfe4805SMarkus Armbruster error_setg(errp,
12629ff7890SWen Congyang "The option mode's value should be primary or secondary");
12729ff7890SWen Congyang goto fail;
12829ff7890SWen Congyang }
12929ff7890SWen Congyang
13029ff7890SWen Congyang s->rs = replication_new(bs, &replication_ops);
13129ff7890SWen Congyang
13229ff7890SWen Congyang ret = 0;
13329ff7890SWen Congyang
13429ff7890SWen Congyang fail:
13529ff7890SWen Congyang qemu_opts_del(opts);
13629ff7890SWen Congyang return ret;
13729ff7890SWen Congyang }
13829ff7890SWen Congyang
/*
 * Close a replication filter node.
 *
 * A still-running replication is stopped without failover, a job left in
 * the FAILOVER stage is cancelled synchronously, the top-id copy owned by
 * a secondary node is released and the node is unregistered through
 * replication_remove().
 */
static void replication_close(BlockDriverState *bs)
{
    BDRVReplicationState *s = bs->opaque;

    GLOBAL_STATE_CODE();

    if (s->stage == BLOCK_REPLICATION_RUNNING) {
        replication_stop(s->rs, false, NULL);
    }

    /* Re-check the stage: it is tested again after the stop call above */
    if (s->stage == BLOCK_REPLICATION_FAILOVER) {
        Job *job = &s->commit_job->job;

        assert(job->aio_context == qemu_get_current_aio_context());
        job_cancel_sync(job, false);
    }

    if (s->mode == REPLICATION_MODE_SECONDARY) {
        g_free(s->top_id);
    }

    replication_remove(s->rs);
}
16029ff7890SWen Congyang
/*
 * Compute the permissions this node needs on, and shares for, child @c.
 *
 * *nperm:   CONSISTENT_READ when @role includes BDRV_CHILD_PRIMARY, plus
 *           WRITE when the node is open read-write and not inactive.
 * *nshared: always CONSISTENT_READ | WRITE | WRITE_UNCHANGED.
 *
 * @c, @reopen_queue, @perm and @shared are not consulted.
 */
static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
                                   BdrvChildRole role,
                                   BlockReopenQueue *reopen_queue,
                                   uint64_t perm, uint64_t shared,
                                   uint64_t *nperm, uint64_t *nshared)
{
    uint64_t wanted = 0;
    int rw_bits = bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR);

    if (role & BDRV_CHILD_PRIMARY) {
        wanted = BLK_PERM_CONSISTENT_READ;
    }

    /* Read-write requested and the inactive flag is clear */
    if (rw_bits == BDRV_O_RDWR) {
        wanted |= BLK_PERM_WRITE;
    }

    *nperm = wanted;
    *nshared = BLK_PERM_CONSISTENT_READ
               | BLK_PERM_WRITE
               | BLK_PERM_WRITE_UNCHANGED;
}
18137a9051cSChanglong Xie
1828ab8140aSKevin Wolf static int64_t coroutine_fn GRAPH_RDLOCK
replication_co_getlength(BlockDriverState * bs)1838ab8140aSKevin Wolf replication_co_getlength(BlockDriverState *bs)
18429ff7890SWen Congyang {
185c86422c5SEmanuele Giuseppe Esposito return bdrv_co_getlength(bs->file->bs);
18629ff7890SWen Congyang }
18729ff7890SWen Congyang
/*
 * Map the current stage to an I/O disposition.
 *
 * Returns:
 *   -EIO  I/O is refused (replication not started, or a primary node in
 *         any failover/done stage);
 *    0    I/O may go straight to bs->file;
 *    1    secondary node whose failover failed (the write path then picks
 *         the target per allocated range).
 *
 * Aborts on a stage value outside the enum.
 */
static int replication_get_io_status(BDRVReplicationState *s)
{
    bool is_primary = (s->mode == REPLICATION_MODE_PRIMARY);

    switch (s->stage) {
    case BLOCK_REPLICATION_NONE:
        return -EIO;
    case BLOCK_REPLICATION_RUNNING:
        return 0;
    case BLOCK_REPLICATION_FAILOVER_FAILED:
        return is_primary ? -EIO : 1;
    case BLOCK_REPLICATION_FAILOVER:
    case BLOCK_REPLICATION_DONE:
        /*
         * For DONE: the active commit job has completed and the active
         * disk and secondary_disk are swapped, so bs->file can be
         * operated on directly.
         */
        return is_primary ? -EIO : 0;
    default:
        abort();
    }
}
20929ff7890SWen Congyang
/*
 * Post-process the result of an I/O request.
 *
 * A secondary node returns @ret unchanged. Any other node records a
 * negative @ret in s->error and reports success (0) to the caller;
 * non-negative values are returned as they are.
 */
static int replication_return_value(BDRVReplicationState *s, int ret)
{
    if (s->mode != REPLICATION_MODE_SECONDARY && ret < 0) {
        s->error = ret;
        return 0;
    }

    return ret;
}
22329ff7890SWen Congyang
224b9b10c35SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
replication_co_readv(BlockDriverState * bs,int64_t sector_num,int remaining_sectors,QEMUIOVector * qiov)225b9b10c35SKevin Wolf replication_co_readv(BlockDriverState *bs, int64_t sector_num,
226b9b10c35SKevin Wolf int remaining_sectors, QEMUIOVector *qiov)
22729ff7890SWen Congyang {
22829ff7890SWen Congyang BDRVReplicationState *s = bs->opaque;
22929ff7890SWen Congyang int ret;
23029ff7890SWen Congyang
23129ff7890SWen Congyang if (s->mode == REPLICATION_MODE_PRIMARY) {
23229ff7890SWen Congyang /* We only use it to forward primary write requests */
23329ff7890SWen Congyang return -EIO;
23429ff7890SWen Congyang }
23529ff7890SWen Congyang
23629ff7890SWen Congyang ret = replication_get_io_status(s);
23729ff7890SWen Congyang if (ret < 0) {
23829ff7890SWen Congyang return ret;
23929ff7890SWen Congyang }
24029ff7890SWen Congyang
24104a11d87SEric Blake ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
24204a11d87SEric Blake remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
243e4f9752cSVladimir Sementsov-Ogievskiy
24429ff7890SWen Congyang return replication_return_value(s, ret);
24529ff7890SWen Congyang }
24629ff7890SWen Congyang
247b9b10c35SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
replication_co_writev(BlockDriverState * bs,int64_t sector_num,int remaining_sectors,QEMUIOVector * qiov,int flags)248b9b10c35SKevin Wolf replication_co_writev(BlockDriverState *bs, int64_t sector_num,
249b9b10c35SKevin Wolf int remaining_sectors, QEMUIOVector *qiov, int flags)
25029ff7890SWen Congyang {
25129ff7890SWen Congyang BDRVReplicationState *s = bs->opaque;
25229ff7890SWen Congyang QEMUIOVector hd_qiov;
25329ff7890SWen Congyang uint64_t bytes_done = 0;
25429ff7890SWen Congyang BdrvChild *top = bs->file;
25529ff7890SWen Congyang BdrvChild *base = s->secondary_disk;
25629ff7890SWen Congyang BdrvChild *target;
25751b0a488SEric Blake int ret;
25851b0a488SEric Blake int64_t n;
25929ff7890SWen Congyang
26029ff7890SWen Congyang ret = replication_get_io_status(s);
26129ff7890SWen Congyang if (ret < 0) {
26229ff7890SWen Congyang goto out;
26329ff7890SWen Congyang }
26429ff7890SWen Congyang
26529ff7890SWen Congyang if (ret == 0) {
26604a11d87SEric Blake ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
26704a11d87SEric Blake remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
26829ff7890SWen Congyang return replication_return_value(s, ret);
26929ff7890SWen Congyang }
27029ff7890SWen Congyang
27129ff7890SWen Congyang /*
27229ff7890SWen Congyang * Failover failed, only write to active disk if the sectors
27329ff7890SWen Congyang * have already been allocated in active disk/hidden disk.
27429ff7890SWen Congyang */
27529ff7890SWen Congyang qemu_iovec_init(&hd_qiov, qiov->niov);
27629ff7890SWen Congyang while (remaining_sectors > 0) {
27751b0a488SEric Blake int64_t count;
27851b0a488SEric Blake
279cc323997SPaolo Bonzini ret = bdrv_co_is_allocated_above(top->bs, base->bs, false,
28051b0a488SEric Blake sector_num * BDRV_SECTOR_SIZE,
28151b0a488SEric Blake remaining_sectors * BDRV_SECTOR_SIZE,
28251b0a488SEric Blake &count);
28329ff7890SWen Congyang if (ret < 0) {
28429ff7890SWen Congyang goto out1;
28529ff7890SWen Congyang }
28629ff7890SWen Congyang
28751b0a488SEric Blake assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
28851b0a488SEric Blake n = count >> BDRV_SECTOR_BITS;
28929ff7890SWen Congyang qemu_iovec_reset(&hd_qiov);
29051b0a488SEric Blake qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
29129ff7890SWen Congyang
29229ff7890SWen Congyang target = ret ? top : base;
29304a11d87SEric Blake ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
29404a11d87SEric Blake n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
29529ff7890SWen Congyang if (ret < 0) {
29629ff7890SWen Congyang goto out1;
29729ff7890SWen Congyang }
29829ff7890SWen Congyang
29929ff7890SWen Congyang remaining_sectors -= n;
30029ff7890SWen Congyang sector_num += n;
30151b0a488SEric Blake bytes_done += count;
30229ff7890SWen Congyang }
30329ff7890SWen Congyang
30429ff7890SWen Congyang out1:
30529ff7890SWen Congyang qemu_iovec_destroy(&hd_qiov);
30629ff7890SWen Congyang out:
30729ff7890SWen Congyang return ret;
30829ff7890SWen Congyang }
30929ff7890SWen Congyang
3100bb79c97SKevin Wolf static void GRAPH_UNLOCKED
secondary_do_checkpoint(BlockDriverState * bs,Error ** errp)3110bb79c97SKevin Wolf secondary_do_checkpoint(BlockDriverState *bs, Error **errp)
31229ff7890SWen Congyang {
3131e12ecfdSLukas Straub BDRVReplicationState *s = bs->opaque;
3141f051dcbSKevin Wolf BdrvChild *active_disk;
31529ff7890SWen Congyang Error *local_err = NULL;
31629ff7890SWen Congyang int ret;
31729ff7890SWen Congyang
3180bb79c97SKevin Wolf GRAPH_RDLOCK_GUARD_MAINLOOP();
3190bb79c97SKevin Wolf
320cc19f177SVladimir Sementsov-Ogievskiy if (!s->backup_job) {
32129ff7890SWen Congyang error_setg(errp, "Backup job was cancelled unexpectedly");
32229ff7890SWen Congyang return;
32329ff7890SWen Congyang }
32429ff7890SWen Congyang
325cc19f177SVladimir Sementsov-Ogievskiy backup_do_checkpoint(s->backup_job, &local_err);
32629ff7890SWen Congyang if (local_err) {
32729ff7890SWen Congyang error_propagate(errp, local_err);
32829ff7890SWen Congyang return;
32929ff7890SWen Congyang }
33029ff7890SWen Congyang
3311f051dcbSKevin Wolf active_disk = bs->file;
3321e12ecfdSLukas Straub if (!active_disk->bs->drv) {
333d470ad42SMax Reitz error_setg(errp, "Active disk %s is ejected",
3341e12ecfdSLukas Straub active_disk->bs->node_name);
335d470ad42SMax Reitz return;
336d470ad42SMax Reitz }
337d470ad42SMax Reitz
3381e12ecfdSLukas Straub ret = bdrv_make_empty(active_disk, errp);
33929ff7890SWen Congyang if (ret < 0) {
34029ff7890SWen Congyang return;
34129ff7890SWen Congyang }
34229ff7890SWen Congyang
343d470ad42SMax Reitz if (!s->hidden_disk->bs->drv) {
344d470ad42SMax Reitz error_setg(errp, "Hidden disk %s is ejected",
345d470ad42SMax Reitz s->hidden_disk->bs->node_name);
346d470ad42SMax Reitz return;
347d470ad42SMax Reitz }
348d470ad42SMax Reitz
349c2cf0ecaSLukas Straub ret = bdrv_make_empty(s->hidden_disk, errp);
35029ff7890SWen Congyang if (ret < 0) {
35129ff7890SWen Congyang return;
35229ff7890SWen Congyang }
35329ff7890SWen Congyang }
35429ff7890SWen Congyang
3553c4e9647SAlberto Garcia /* This function is supposed to be called twice:
3563c4e9647SAlberto Garcia * first with writable = true, then with writable = false.
3573c4e9647SAlberto Garcia * The first call puts s->hidden_disk and s->secondary_disk in
3583c4e9647SAlberto Garcia * r/w mode, and the second puts them back in their original state.
3593c4e9647SAlberto Garcia */
reopen_backing_file(BlockDriverState * bs,bool writable,Error ** errp)3608dd9006eSPaolo Bonzini static void reopen_backing_file(BlockDriverState *bs, bool writable,
36129ff7890SWen Congyang Error **errp)
36229ff7890SWen Congyang {
3638dd9006eSPaolo Bonzini BDRVReplicationState *s = bs->opaque;
364a990a42bSLukas Straub BdrvChild *hidden_disk, *secondary_disk;
36529ff7890SWen Congyang BlockReopenQueue *reopen_queue = NULL;
36629ff7890SWen Congyang
367004915a9SKevin Wolf GLOBAL_STATE_CODE();
368004915a9SKevin Wolf GRAPH_RDLOCK_GUARD_MAINLOOP();
369004915a9SKevin Wolf
370a990a42bSLukas Straub /*
371a990a42bSLukas Straub * s->hidden_disk and s->secondary_disk may not be set yet, as they will
372a990a42bSLukas Straub * only be set after the children are writable.
373a990a42bSLukas Straub */
374a990a42bSLukas Straub hidden_disk = bs->file->bs->backing;
375a990a42bSLukas Straub secondary_disk = hidden_disk->bs->backing;
376a990a42bSLukas Straub
37729ff7890SWen Congyang if (writable) {
378a990a42bSLukas Straub s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs);
379a990a42bSLukas Straub s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
38029ff7890SWen Congyang }
38129ff7890SWen Congyang
3823c4e9647SAlberto Garcia if (s->orig_hidden_read_only) {
3833c4e9647SAlberto Garcia QDict *opts = qdict_new();
3843c4e9647SAlberto Garcia qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
385a990a42bSLukas Straub reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs,
386077e8e20SAlberto Garcia opts, true);
38729ff7890SWen Congyang }
38829ff7890SWen Congyang
3893c4e9647SAlberto Garcia if (s->orig_secondary_read_only) {
3903c4e9647SAlberto Garcia QDict *opts = qdict_new();
3913c4e9647SAlberto Garcia qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
392a990a42bSLukas Straub reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs,
393077e8e20SAlberto Garcia opts, true);
39429ff7890SWen Congyang }
39529ff7890SWen Congyang
39629ff7890SWen Congyang if (reopen_queue) {
397992861fbSMarkus Armbruster bdrv_reopen_multiple(reopen_queue, errp);
39829ff7890SWen Congyang }
39929ff7890SWen Congyang }
40029ff7890SWen Congyang
/*
 * Undo what replication_start() set up around the backup job.
 *
 * Forgets the job pointer, then, if the top node named by s->top_id can
 * still be found, lifts the op blockers from it, frees the blocker reason
 * and puts the backing files back into their original read-only state.
 * Nothing further happens when the top node cannot be found.
 */
static void backup_job_cleanup(BlockDriverState *bs)
{
    BDRVReplicationState *s = bs->opaque;
    BlockDriverState *top_bs;

    s->backup_job = NULL;

    top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
    if (!top_bs) {
        return;
    }
    bdrv_op_unblock_all(top_bs, s->blocker);
    error_free(s->blocker);
    /*
     * Do not keep a dangling pointer: replication_start() may run again
     * (e.g. after backup_job_create() failed) and passes &s->blocker to
     * error_setg(), which requires the pointed-to Error to be NULL.
     */
    s->blocker = NULL;
    reopen_backing_file(bs, false, NULL);
}
41629ff7890SWen Congyang
backup_job_completed(void * opaque,int ret)41729ff7890SWen Congyang static void backup_job_completed(void *opaque, int ret)
41829ff7890SWen Congyang {
4198dd9006eSPaolo Bonzini BlockDriverState *bs = opaque;
4208dd9006eSPaolo Bonzini BDRVReplicationState *s = bs->opaque;
42129ff7890SWen Congyang
4223c76c606SFam Zheng if (s->stage != BLOCK_REPLICATION_FAILOVER) {
42329ff7890SWen Congyang /* The backup job is cancelled unexpectedly */
42429ff7890SWen Congyang s->error = -EIO;
42529ff7890SWen Congyang }
42629ff7890SWen Congyang
4278dd9006eSPaolo Bonzini backup_job_cleanup(bs);
42829ff7890SWen Congyang }
42929ff7890SWen Congyang
430680e0cc4SKevin Wolf static bool GRAPH_RDLOCK
check_top_bs(BlockDriverState * top_bs,BlockDriverState * bs)431680e0cc4SKevin Wolf check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
43229ff7890SWen Congyang {
43329ff7890SWen Congyang BdrvChild *child;
43429ff7890SWen Congyang
43529ff7890SWen Congyang /* The bs itself is the top_bs */
43629ff7890SWen Congyang if (top_bs == bs) {
43729ff7890SWen Congyang return true;
43829ff7890SWen Congyang }
43929ff7890SWen Congyang
44029ff7890SWen Congyang /* Iterate over top_bs's children */
44129ff7890SWen Congyang QLIST_FOREACH(child, &top_bs->children, next) {
44229ff7890SWen Congyang if (child->bs == bs || check_top_bs(child->bs, bs)) {
44329ff7890SWen Congyang return true;
44429ff7890SWen Congyang }
44529ff7890SWen Congyang }
44629ff7890SWen Congyang
44729ff7890SWen Congyang return false;
44829ff7890SWen Congyang }
44929ff7890SWen Congyang
replication_start(ReplicationState * rs,ReplicationMode mode,Error ** errp)45029ff7890SWen Congyang static void replication_start(ReplicationState *rs, ReplicationMode mode,
45129ff7890SWen Congyang Error **errp)
45229ff7890SWen Congyang {
45329ff7890SWen Congyang BlockDriverState *bs = rs->opaque;
45429ff7890SWen Congyang BDRVReplicationState *s;
45529ff7890SWen Congyang BlockDriverState *top_bs;
456a990a42bSLukas Straub BdrvChild *active_disk, *hidden_disk, *secondary_disk;
45729ff7890SWen Congyang int64_t active_length, hidden_length, disk_length;
45829ff7890SWen Congyang Error *local_err = NULL;
4592c59fd83SVladimir Sementsov-Ogievskiy BackupPerf perf = { .use_copy_range = true, .max_workers = 1 };
46029ff7890SWen Congyang
4612b3912f1SKevin Wolf GLOBAL_STATE_CODE();
4622b3912f1SKevin Wolf
46329ff7890SWen Congyang s = bs->opaque;
46429ff7890SWen Congyang
46508ddb4ebSLukas Straub if (s->stage == BLOCK_REPLICATION_DONE ||
46608ddb4ebSLukas Straub s->stage == BLOCK_REPLICATION_FAILOVER) {
46708ddb4ebSLukas Straub /*
46808ddb4ebSLukas Straub * This case happens when a secondary is promoted to primary.
46908ddb4ebSLukas Straub * Ignore the request because the secondary side of replication
47008ddb4ebSLukas Straub * doesn't have to do anything anymore.
47108ddb4ebSLukas Straub */
47208ddb4ebSLukas Straub return;
47308ddb4ebSLukas Straub }
47408ddb4ebSLukas Straub
4753c76c606SFam Zheng if (s->stage != BLOCK_REPLICATION_NONE) {
47629ff7890SWen Congyang error_setg(errp, "Block replication is running or done");
47729ff7890SWen Congyang return;
47829ff7890SWen Congyang }
47929ff7890SWen Congyang
48029ff7890SWen Congyang if (s->mode != mode) {
48129ff7890SWen Congyang error_setg(errp, "The parameter mode's value is invalid, needs %d,"
48229ff7890SWen Congyang " but got %d", s->mode, mode);
48329ff7890SWen Congyang return;
48429ff7890SWen Congyang }
48529ff7890SWen Congyang
48629ff7890SWen Congyang switch (s->mode) {
48729ff7890SWen Congyang case REPLICATION_MODE_PRIMARY:
48829ff7890SWen Congyang break;
48929ff7890SWen Congyang case REPLICATION_MODE_SECONDARY:
490004915a9SKevin Wolf bdrv_graph_rdlock_main_loop();
4911e12ecfdSLukas Straub active_disk = bs->file;
4921e12ecfdSLukas Straub if (!active_disk || !active_disk->bs || !active_disk->bs->backing) {
49329ff7890SWen Congyang error_setg(errp, "Active disk doesn't have backing file");
494004915a9SKevin Wolf bdrv_graph_rdunlock_main_loop();
49529ff7890SWen Congyang return;
49629ff7890SWen Congyang }
49729ff7890SWen Congyang
498a990a42bSLukas Straub hidden_disk = active_disk->bs->backing;
499a990a42bSLukas Straub if (!hidden_disk->bs || !hidden_disk->bs->backing) {
50029ff7890SWen Congyang error_setg(errp, "Hidden disk doesn't have backing file");
501004915a9SKevin Wolf bdrv_graph_rdunlock_main_loop();
50229ff7890SWen Congyang return;
50329ff7890SWen Congyang }
50429ff7890SWen Congyang
505a990a42bSLukas Straub secondary_disk = hidden_disk->bs->backing;
506a990a42bSLukas Straub if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
50729ff7890SWen Congyang error_setg(errp, "The secondary disk doesn't have block backend");
5082b3912f1SKevin Wolf bdrv_graph_rdunlock_main_loop();
50929ff7890SWen Congyang return;
51029ff7890SWen Congyang }
5112b3912f1SKevin Wolf bdrv_graph_rdunlock_main_loop();
51229ff7890SWen Congyang
51329ff7890SWen Congyang /* verify the length */
5141e12ecfdSLukas Straub active_length = bdrv_getlength(active_disk->bs);
515a990a42bSLukas Straub hidden_length = bdrv_getlength(hidden_disk->bs);
516a990a42bSLukas Straub disk_length = bdrv_getlength(secondary_disk->bs);
51729ff7890SWen Congyang if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
51829ff7890SWen Congyang active_length != hidden_length || hidden_length != disk_length) {
51929ff7890SWen Congyang error_setg(errp, "Active disk, hidden disk, secondary disk's length"
52029ff7890SWen Congyang " are not the same");
52129ff7890SWen Congyang return;
52229ff7890SWen Congyang }
52329ff7890SWen Congyang
524d470ad42SMax Reitz /* Must be true, or the bdrv_getlength() calls would have failed */
525a990a42bSLukas Straub assert(active_disk->bs->drv && hidden_disk->bs->drv);
526d470ad42SMax Reitz
5270bb79c97SKevin Wolf bdrv_graph_rdlock_main_loop();
5281e12ecfdSLukas Straub if (!active_disk->bs->drv->bdrv_make_empty ||
529a990a42bSLukas Straub !hidden_disk->bs->drv->bdrv_make_empty) {
53029ff7890SWen Congyang error_setg(errp,
53129ff7890SWen Congyang "Active disk or hidden disk doesn't support make_empty");
5320bb79c97SKevin Wolf bdrv_graph_rdunlock_main_loop();
53329ff7890SWen Congyang return;
53429ff7890SWen Congyang }
5350bb79c97SKevin Wolf bdrv_graph_rdunlock_main_loop();
53629ff7890SWen Congyang
53729ff7890SWen Congyang /* reopen the backing file in r/w mode */
5388dd9006eSPaolo Bonzini reopen_backing_file(bs, true, &local_err);
53929ff7890SWen Congyang if (local_err) {
54029ff7890SWen Congyang error_propagate(errp, local_err);
54129ff7890SWen Congyang return;
54229ff7890SWen Congyang }
54329ff7890SWen Congyang
5446bc30f19SStefan Hajnoczi bdrv_graph_wrlock();
545afdaeb9eSKevin Wolf
5463b78420bSLukas Straub bdrv_ref(hidden_disk->bs);
5473b78420bSLukas Straub s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
5483b78420bSLukas Straub &child_of_bds, BDRV_CHILD_DATA,
5493b78420bSLukas Straub &local_err);
5503b78420bSLukas Straub if (local_err) {
5513b78420bSLukas Straub error_propagate(errp, local_err);
5526bc30f19SStefan Hajnoczi bdrv_graph_wrunlock();
5533b78420bSLukas Straub return;
5543b78420bSLukas Straub }
5553b78420bSLukas Straub
5563b78420bSLukas Straub bdrv_ref(secondary_disk->bs);
5573b78420bSLukas Straub s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs,
5583b78420bSLukas Straub "secondary disk", &child_of_bds,
5593b78420bSLukas Straub BDRV_CHILD_DATA, &local_err);
5603b78420bSLukas Straub if (local_err) {
5613b78420bSLukas Straub error_propagate(errp, local_err);
5626bc30f19SStefan Hajnoczi bdrv_graph_wrunlock();
5633b78420bSLukas Straub return;
5643b78420bSLukas Straub }
565a990a42bSLukas Straub
56629ff7890SWen Congyang /* start backup job now */
56729ff7890SWen Congyang error_setg(&s->blocker,
56829ff7890SWen Congyang "Block device is in use by internal backup job");
56929ff7890SWen Congyang
57029ff7890SWen Congyang top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
57129ff7890SWen Congyang if (!top_bs || !bdrv_is_root_node(top_bs) ||
57229ff7890SWen Congyang !check_top_bs(top_bs, bs)) {
57329ff7890SWen Congyang error_setg(errp, "No top_bs or it is invalid");
5746bc30f19SStefan Hajnoczi bdrv_graph_wrunlock();
5758dd9006eSPaolo Bonzini reopen_backing_file(bs, false, NULL);
57629ff7890SWen Congyang return;
57729ff7890SWen Congyang }
57829ff7890SWen Congyang bdrv_op_block_all(top_bs, s->blocker);
57929ff7890SWen Congyang bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
58029ff7890SWen Congyang
5816bc30f19SStefan Hajnoczi bdrv_graph_wrunlock();
5822b3912f1SKevin Wolf
583cc19f177SVladimir Sementsov-Ogievskiy s->backup_job = backup_job_create(
584cc19f177SVladimir Sementsov-Ogievskiy NULL, s->secondary_disk->bs, s->hidden_disk->bs,
585*0fd05c8dSVladimir Sementsov-Ogievskiy 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
586*0fd05c8dSVladimir Sementsov-Ogievskiy NULL, &perf,
587111049a4SJohn Snow BLOCKDEV_ON_ERROR_REPORT,
588bb02b65cSKevin Wolf BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
589111049a4SJohn Snow backup_job_completed, bs, NULL, &local_err);
59029ff7890SWen Congyang if (local_err) {
59129ff7890SWen Congyang error_propagate(errp, local_err);
5928dd9006eSPaolo Bonzini backup_job_cleanup(bs);
59329ff7890SWen Congyang return;
59429ff7890SWen Congyang }
595cc19f177SVladimir Sementsov-Ogievskiy job_start(&s->backup_job->job);
59629ff7890SWen Congyang break;
59729ff7890SWen Congyang default:
59829ff7890SWen Congyang abort();
59929ff7890SWen Congyang }
60029ff7890SWen Congyang
6013c76c606SFam Zheng s->stage = BLOCK_REPLICATION_RUNNING;
60229ff7890SWen Congyang
60329ff7890SWen Congyang if (s->mode == REPLICATION_MODE_SECONDARY) {
6041e12ecfdSLukas Straub secondary_do_checkpoint(bs, errp);
60529ff7890SWen Congyang }
60629ff7890SWen Congyang
60729ff7890SWen Congyang s->error = 0;
60829ff7890SWen Congyang }
60929ff7890SWen Congyang
replication_do_checkpoint(ReplicationState * rs,Error ** errp)61029ff7890SWen Congyang static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
61129ff7890SWen Congyang {
61229ff7890SWen Congyang BlockDriverState *bs = rs->opaque;
613b49f4755SStefan Hajnoczi BDRVReplicationState *s = bs->opaque;
61429ff7890SWen Congyang
61508ddb4ebSLukas Straub if (s->stage == BLOCK_REPLICATION_DONE ||
61608ddb4ebSLukas Straub s->stage == BLOCK_REPLICATION_FAILOVER) {
61708ddb4ebSLukas Straub /*
61808ddb4ebSLukas Straub * This case happens when a secondary was promoted to primary.
61908ddb4ebSLukas Straub * Ignore the request because the secondary side of replication
62008ddb4ebSLukas Straub * doesn't have to do anything anymore.
62108ddb4ebSLukas Straub */
62208ddb4ebSLukas Straub return;
62308ddb4ebSLukas Straub }
62408ddb4ebSLukas Straub
62529ff7890SWen Congyang if (s->mode == REPLICATION_MODE_SECONDARY) {
6261e12ecfdSLukas Straub secondary_do_checkpoint(bs, errp);
62729ff7890SWen Congyang }
62829ff7890SWen Congyang }
62929ff7890SWen Congyang
replication_get_error(ReplicationState * rs,Error ** errp)63029ff7890SWen Congyang static void replication_get_error(ReplicationState *rs, Error **errp)
63129ff7890SWen Congyang {
63229ff7890SWen Congyang BlockDriverState *bs = rs->opaque;
633b49f4755SStefan Hajnoczi BDRVReplicationState *s = bs->opaque;
63429ff7890SWen Congyang
63508ddb4ebSLukas Straub if (s->stage == BLOCK_REPLICATION_NONE) {
63629ff7890SWen Congyang error_setg(errp, "Block replication is not running");
63729ff7890SWen Congyang return;
63829ff7890SWen Congyang }
63929ff7890SWen Congyang
64029ff7890SWen Congyang if (s->error) {
64129ff7890SWen Congyang error_setg(errp, "I/O error occurred");
64229ff7890SWen Congyang return;
64329ff7890SWen Congyang }
64429ff7890SWen Congyang }
64529ff7890SWen Congyang
replication_done(void * opaque,int ret)64629ff7890SWen Congyang static void replication_done(void *opaque, int ret)
64729ff7890SWen Congyang {
64829ff7890SWen Congyang BlockDriverState *bs = opaque;
64929ff7890SWen Congyang BDRVReplicationState *s = bs->opaque;
65029ff7890SWen Congyang
65129ff7890SWen Congyang if (ret == 0) {
6523c76c606SFam Zheng s->stage = BLOCK_REPLICATION_DONE;
65329ff7890SWen Congyang
6546bc30f19SStefan Hajnoczi bdrv_graph_wrlock();
6553b78420bSLukas Straub bdrv_unref_child(bs, s->secondary_disk);
65629ff7890SWen Congyang s->secondary_disk = NULL;
6573b78420bSLukas Straub bdrv_unref_child(bs, s->hidden_disk);
65829ff7890SWen Congyang s->hidden_disk = NULL;
6596bc30f19SStefan Hajnoczi bdrv_graph_wrunlock();
66032a8aba3SKevin Wolf
66129ff7890SWen Congyang s->error = 0;
66229ff7890SWen Congyang } else {
6633c76c606SFam Zheng s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
66429ff7890SWen Congyang s->error = -EIO;
66529ff7890SWen Congyang }
66629ff7890SWen Congyang }
66729ff7890SWen Congyang
replication_stop(ReplicationState * rs,bool failover,Error ** errp)66829ff7890SWen Congyang static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
66929ff7890SWen Congyang {
67029ff7890SWen Congyang BlockDriverState *bs = rs->opaque;
671b49f4755SStefan Hajnoczi BDRVReplicationState *s = bs->opaque;
67229ff7890SWen Congyang
67308ddb4ebSLukas Straub if (s->stage == BLOCK_REPLICATION_DONE ||
67408ddb4ebSLukas Straub s->stage == BLOCK_REPLICATION_FAILOVER) {
67508ddb4ebSLukas Straub /*
67608ddb4ebSLukas Straub * This case happens when a secondary was promoted to primary.
67708ddb4ebSLukas Straub * Ignore the request because the secondary side of replication
67808ddb4ebSLukas Straub * doesn't have to do anything anymore.
67908ddb4ebSLukas Straub */
68008ddb4ebSLukas Straub return;
68108ddb4ebSLukas Straub }
68208ddb4ebSLukas Straub
6833c76c606SFam Zheng if (s->stage != BLOCK_REPLICATION_RUNNING) {
68429ff7890SWen Congyang error_setg(errp, "Block replication is not running");
68529ff7890SWen Congyang return;
68629ff7890SWen Congyang }
68729ff7890SWen Congyang
68829ff7890SWen Congyang switch (s->mode) {
68929ff7890SWen Congyang case REPLICATION_MODE_PRIMARY:
6903c76c606SFam Zheng s->stage = BLOCK_REPLICATION_DONE;
69129ff7890SWen Congyang s->error = 0;
69229ff7890SWen Congyang break;
69329ff7890SWen Congyang case REPLICATION_MODE_SECONDARY:
69429ff7890SWen Congyang /*
69529ff7890SWen Congyang * This BDS will be closed, and the job should be completed
69629ff7890SWen Congyang * before the BDS is closed, because we will access hidden
69729ff7890SWen Congyang * disk, secondary disk in backup_job_completed().
69829ff7890SWen Congyang */
699cc19f177SVladimir Sementsov-Ogievskiy if (s->backup_job) {
7004cfb3f05SHanna Reitz job_cancel_sync(&s->backup_job->job, true);
70129ff7890SWen Congyang }
70229ff7890SWen Congyang
70329ff7890SWen Congyang if (!failover) {
7041e12ecfdSLukas Straub secondary_do_checkpoint(bs, errp);
7053c76c606SFam Zheng s->stage = BLOCK_REPLICATION_DONE;
70629ff7890SWen Congyang return;
70729ff7890SWen Congyang }
70829ff7890SWen Congyang
7091f051dcbSKevin Wolf bdrv_graph_rdlock_main_loop();
7103c76c606SFam Zheng s->stage = BLOCK_REPLICATION_FAILOVER;
711cc19f177SVladimir Sementsov-Ogievskiy s->commit_job = commit_active_start(
7121e12ecfdSLukas Straub NULL, bs->file->bs, s->secondary_disk->bs,
713bb02b65cSKevin Wolf JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
71478bbd910SFam Zheng NULL, replication_done, bs, true, errp);
7151f051dcbSKevin Wolf bdrv_graph_rdunlock_main_loop();
71629ff7890SWen Congyang break;
71729ff7890SWen Congyang default:
71829ff7890SWen Congyang abort();
71929ff7890SWen Congyang }
72029ff7890SWen Congyang }
72129ff7890SWen Congyang
7222654267cSMax Reitz static const char *const replication_strong_runtime_opts[] = {
7232654267cSMax Reitz REPLICATION_MODE,
7242654267cSMax Reitz REPLICATION_TOP_ID,
7252654267cSMax Reitz
7262654267cSMax Reitz NULL
7272654267cSMax Reitz };
7282654267cSMax Reitz
729782b9d06SAlberto Garcia static BlockDriver bdrv_replication = {
73029ff7890SWen Congyang .format_name = "replication",
73129ff7890SWen Congyang .instance_size = sizeof(BDRVReplicationState),
73229ff7890SWen Congyang
73329ff7890SWen Congyang .bdrv_open = replication_open,
73429ff7890SWen Congyang .bdrv_close = replication_close,
73537a9051cSChanglong Xie .bdrv_child_perm = replication_child_perm,
73629ff7890SWen Congyang
737c86422c5SEmanuele Giuseppe Esposito .bdrv_co_getlength = replication_co_getlength,
73829ff7890SWen Congyang .bdrv_co_readv = replication_co_readv,
73929ff7890SWen Congyang .bdrv_co_writev = replication_co_writev,
74029ff7890SWen Congyang
74129ff7890SWen Congyang .is_filter = true,
74229ff7890SWen Congyang
7432654267cSMax Reitz .strong_runtime_opts = replication_strong_runtime_opts,
74429ff7890SWen Congyang };
74529ff7890SWen Congyang
bdrv_replication_init(void)74629ff7890SWen Congyang static void bdrv_replication_init(void)
74729ff7890SWen Congyang {
74829ff7890SWen Congyang bdrv_register(&bdrv_replication);
74929ff7890SWen Congyang }
75029ff7890SWen Congyang
75129ff7890SWen Congyang block_init(bdrv_replication_init);
752