129ff7890SWen Congyang /* 229ff7890SWen Congyang * Replication Block filter 329ff7890SWen Congyang * 429ff7890SWen Congyang * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. 529ff7890SWen Congyang * Copyright (c) 2016 Intel Corporation 629ff7890SWen Congyang * Copyright (c) 2016 FUJITSU LIMITED 729ff7890SWen Congyang * 829ff7890SWen Congyang * Author: 929ff7890SWen Congyang * Wen Congyang <wency@cn.fujitsu.com> 1029ff7890SWen Congyang * 1129ff7890SWen Congyang * This work is licensed under the terms of the GNU GPL, version 2 or later. 1229ff7890SWen Congyang * See the COPYING file in the top-level directory. 1329ff7890SWen Congyang */ 1429ff7890SWen Congyang 1529ff7890SWen Congyang #include "qemu/osdep.h" 160b8fa32fSMarkus Armbruster #include "qemu/module.h" 17922a01a0SMarkus Armbruster #include "qemu/option.h" 1829ff7890SWen Congyang #include "block/nbd.h" 1929ff7890SWen Congyang #include "block/blockjob.h" 2029ff7890SWen Congyang #include "block/block_int.h" 2129ff7890SWen Congyang #include "block/block_backup.h" 2229ff7890SWen Congyang #include "sysemu/block-backend.h" 2329ff7890SWen Congyang #include "qapi/error.h" 243c4e9647SAlberto Garcia #include "qapi/qmp/qdict.h" 25b0262955SPaolo Bonzini #include "block/replication.h" 2629ff7890SWen Congyang 273c76c606SFam Zheng typedef enum { 283c76c606SFam Zheng BLOCK_REPLICATION_NONE, /* block replication is not started */ 293c76c606SFam Zheng BLOCK_REPLICATION_RUNNING, /* block replication is running */ 303c76c606SFam Zheng BLOCK_REPLICATION_FAILOVER, /* failover is running in background */ 313c76c606SFam Zheng BLOCK_REPLICATION_FAILOVER_FAILED, /* failover failed */ 323c76c606SFam Zheng BLOCK_REPLICATION_DONE, /* block replication is done */ 333c76c606SFam Zheng } ReplicationStage; 343c76c606SFam Zheng 3529ff7890SWen Congyang typedef struct BDRVReplicationState { 3629ff7890SWen Congyang ReplicationMode mode; 373c76c606SFam Zheng ReplicationStage stage; 38cc19f177SVladimir Sementsov-Ogievskiy BlockJob *commit_job; 3929ff7890SWen Congyang BdrvChild *hidden_disk; 4029ff7890SWen Congyang BdrvChild *secondary_disk; 41cc19f177SVladimir Sementsov-Ogievskiy BlockJob *backup_job; 4229ff7890SWen Congyang char *top_id; 4329ff7890SWen Congyang ReplicationState *rs; 4429ff7890SWen Congyang Error *blocker; 453c4e9647SAlberto Garcia bool orig_hidden_read_only; 463c4e9647SAlberto Garcia bool orig_secondary_read_only; 4729ff7890SWen Congyang int error; 4829ff7890SWen Congyang } BDRVReplicationState; 4929ff7890SWen Congyang 5029ff7890SWen Congyang static void replication_start(ReplicationState *rs, ReplicationMode mode, 5129ff7890SWen Congyang Error **errp); 5229ff7890SWen Congyang static void replication_do_checkpoint(ReplicationState *rs, Error **errp); 5329ff7890SWen Congyang static void replication_get_error(ReplicationState *rs, Error **errp); 5429ff7890SWen Congyang static void replication_stop(ReplicationState *rs, bool failover, 5529ff7890SWen Congyang Error **errp); 5629ff7890SWen Congyang 5729ff7890SWen Congyang #define REPLICATION_MODE "mode" 5829ff7890SWen Congyang #define REPLICATION_TOP_ID "top-id" 5929ff7890SWen Congyang static QemuOptsList replication_runtime_opts = { 6029ff7890SWen Congyang .name = "replication", 6129ff7890SWen Congyang .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head), 6229ff7890SWen Congyang .desc = { 6329ff7890SWen Congyang { 6429ff7890SWen Congyang .name = REPLICATION_MODE, 6529ff7890SWen Congyang .type = QEMU_OPT_STRING, 6629ff7890SWen Congyang }, 6729ff7890SWen Congyang { 6829ff7890SWen Congyang .name = REPLICATION_TOP_ID, 6929ff7890SWen Congyang .type = QEMU_OPT_STRING, 7029ff7890SWen Congyang }, 7129ff7890SWen Congyang { /* end of list */ } 7229ff7890SWen Congyang }, 7329ff7890SWen Congyang }; 7429ff7890SWen Congyang 7529ff7890SWen Congyang static ReplicationOps replication_ops = { 7629ff7890SWen Congyang .start = replication_start, 7729ff7890SWen Congyang .checkpoint = replication_do_checkpoint, 7829ff7890SWen Congyang .get_error = replication_get_error, 7929ff7890SWen Congyang .stop = replication_stop, 8029ff7890SWen Congyang }; 8129ff7890SWen Congyang 8229ff7890SWen Congyang static int replication_open(BlockDriverState *bs, QDict *options, 8329ff7890SWen Congyang int flags, Error **errp) 8429ff7890SWen Congyang { 8529ff7890SWen Congyang int ret; 8629ff7890SWen Congyang BDRVReplicationState *s = bs->opaque; 8729ff7890SWen Congyang QemuOpts *opts = NULL; 8829ff7890SWen Congyang const char *mode; 8929ff7890SWen Congyang const char *top_id; 9029ff7890SWen Congyang 91b3af2af4SMax Reitz bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, 92b3af2af4SMax Reitz BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, 934e4bf5c4SKevin Wolf false, errp); 944e4bf5c4SKevin Wolf if (!bs->file) { 954e4bf5c4SKevin Wolf return -EINVAL; 964e4bf5c4SKevin Wolf } 974e4bf5c4SKevin Wolf 9829ff7890SWen Congyang ret = -EINVAL; 9929ff7890SWen Congyang opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort); 100a5f9b9dfSMarkus Armbruster if (!qemu_opts_absorb_qdict(opts, options, errp)) { 10129ff7890SWen Congyang goto fail; 10229ff7890SWen Congyang } 10329ff7890SWen Congyang 10429ff7890SWen Congyang mode = qemu_opt_get(opts, REPLICATION_MODE); 10529ff7890SWen Congyang if (!mode) { 106dcfe4805SMarkus Armbruster error_setg(errp, "Missing the option mode"); 10729ff7890SWen Congyang goto fail; 10829ff7890SWen Congyang } 10929ff7890SWen Congyang 11029ff7890SWen Congyang if (!strcmp(mode, "primary")) { 11129ff7890SWen Congyang s->mode = REPLICATION_MODE_PRIMARY; 112f4f2539bSChanglong Xie top_id = qemu_opt_get(opts, REPLICATION_TOP_ID); 113f4f2539bSChanglong Xie if (top_id) { 114dcfe4805SMarkus Armbruster error_setg(errp, 115dcfe4805SMarkus Armbruster "The primary side does not support option top-id"); 116f4f2539bSChanglong Xie goto fail; 117f4f2539bSChanglong Xie } 11829ff7890SWen Congyang } else if (!strcmp(mode, "secondary")) { 11929ff7890SWen Congyang s->mode = REPLICATION_MODE_SECONDARY; 12029ff7890SWen Congyang top_id = qemu_opt_get(opts, REPLICATION_TOP_ID); 12129ff7890SWen Congyang s->top_id = g_strdup(top_id); 12229ff7890SWen Congyang if (!s->top_id) { 123dcfe4805SMarkus Armbruster error_setg(errp, "Missing the option top-id"); 12429ff7890SWen Congyang goto fail; 12529ff7890SWen Congyang } 12629ff7890SWen Congyang } else { 127dcfe4805SMarkus Armbruster error_setg(errp, 12829ff7890SWen Congyang "The option mode's value should be primary or secondary"); 12929ff7890SWen Congyang goto fail; 13029ff7890SWen Congyang } 13129ff7890SWen Congyang 13229ff7890SWen Congyang s->rs = replication_new(bs, &replication_ops); 13329ff7890SWen Congyang 13429ff7890SWen Congyang ret = 0; 13529ff7890SWen Congyang 13629ff7890SWen Congyang fail: 13729ff7890SWen Congyang qemu_opts_del(opts); 13829ff7890SWen Congyang return ret; 13929ff7890SWen Congyang } 14029ff7890SWen Congyang 14129ff7890SWen Congyang static void replication_close(BlockDriverState *bs) 14229ff7890SWen Congyang { 14329ff7890SWen Congyang BDRVReplicationState *s = bs->opaque; 14408558e33SStefan Reiter Job *commit_job; 1453ed4f708SEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 14629ff7890SWen Congyang 1473c76c606SFam Zheng if (s->stage == BLOCK_REPLICATION_RUNNING) { 14829ff7890SWen Congyang replication_stop(s->rs, false, NULL); 14929ff7890SWen Congyang } 1503c76c606SFam Zheng if (s->stage == BLOCK_REPLICATION_FAILOVER) { 15108558e33SStefan Reiter commit_job = &s->commit_job->job; 15208558e33SStefan Reiter assert(commit_job->aio_context == qemu_get_current_aio_context()); 1534cfb3f05SHanna Reitz job_cancel_sync(commit_job, false); 15450ab0e09SPaolo Bonzini } 15529ff7890SWen Congyang 15629ff7890SWen Congyang if (s->mode == REPLICATION_MODE_SECONDARY) { 15729ff7890SWen Congyang g_free(s->top_id); 15829ff7890SWen Congyang } 15929ff7890SWen Congyang 16029ff7890SWen Congyang replication_remove(s->rs); 16129ff7890SWen Congyang } 16229ff7890SWen Congyang 16337a9051cSChanglong Xie static void replication_child_perm(BlockDriverState *bs, BdrvChild *c, 164bf8e925eSMax Reitz BdrvChildRole role, 165e0995dc3SKevin Wolf BlockReopenQueue *reopen_queue, 16637a9051cSChanglong Xie uint64_t perm, uint64_t shared, 16737a9051cSChanglong Xie uint64_t *nperm, uint64_t *nshared) 16837a9051cSChanglong Xie { 1693b78420bSLukas Straub if (role & BDRV_CHILD_PRIMARY) { 170611e0653SWang Guang *nperm = BLK_PERM_CONSISTENT_READ; 1713b78420bSLukas Straub } else { 1723b78420bSLukas Straub *nperm = 0; 1733b78420bSLukas Straub } 1743b78420bSLukas Straub 175611e0653SWang Guang if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) { 176611e0653SWang Guang *nperm |= BLK_PERM_WRITE; 177611e0653SWang Guang } 17878ee6bd0SPhilippe Mathieu-Daudé *nshared = BLK_PERM_CONSISTENT_READ 17978ee6bd0SPhilippe Mathieu-Daudé | BLK_PERM_WRITE 18037a9051cSChanglong Xie | BLK_PERM_WRITE_UNCHANGED; 18137a9051cSChanglong Xie return; 18237a9051cSChanglong Xie } 18337a9051cSChanglong Xie 18429ff7890SWen Congyang static int64_t replication_getlength(BlockDriverState *bs) 18529ff7890SWen Congyang { 18629ff7890SWen Congyang return bdrv_getlength(bs->file->bs); 18729ff7890SWen Congyang } 18829ff7890SWen Congyang 18929ff7890SWen Congyang static int replication_get_io_status(BDRVReplicationState *s) 19029ff7890SWen Congyang { 1913c76c606SFam Zheng switch (s->stage) { 19229ff7890SWen Congyang case BLOCK_REPLICATION_NONE: 19329ff7890SWen Congyang return -EIO; 19429ff7890SWen Congyang case BLOCK_REPLICATION_RUNNING: 19529ff7890SWen Congyang return 0; 19629ff7890SWen Congyang case BLOCK_REPLICATION_FAILOVER: 19729ff7890SWen Congyang return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0; 19829ff7890SWen Congyang case BLOCK_REPLICATION_FAILOVER_FAILED: 19929ff7890SWen Congyang return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1; 20029ff7890SWen Congyang case BLOCK_REPLICATION_DONE: 20129ff7890SWen Congyang /* 20229ff7890SWen Congyang * active commit job completes, and active disk and secondary_disk 20329ff7890SWen Congyang * is swapped, so we can operate bs->file directly 20429ff7890SWen Congyang */ 20529ff7890SWen Congyang return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0; 20629ff7890SWen Congyang default: 20729ff7890SWen Congyang abort(); 20829ff7890SWen Congyang } 20929ff7890SWen Congyang } 21029ff7890SWen Congyang 21129ff7890SWen Congyang static int replication_return_value(BDRVReplicationState *s, int ret) 21229ff7890SWen Congyang { 21329ff7890SWen Congyang if (s->mode == REPLICATION_MODE_SECONDARY) { 21429ff7890SWen Congyang return ret; 21529ff7890SWen Congyang } 21629ff7890SWen Congyang 21729ff7890SWen Congyang if (ret < 0) { 21829ff7890SWen Congyang s->error = ret; 21929ff7890SWen Congyang ret = 0; 22029ff7890SWen Congyang } 22129ff7890SWen Congyang 22229ff7890SWen Congyang return ret; 22329ff7890SWen Congyang } 22429ff7890SWen Congyang 22529ff7890SWen Congyang static coroutine_fn int replication_co_readv(BlockDriverState *bs, 22629ff7890SWen Congyang int64_t sector_num, 22729ff7890SWen Congyang int remaining_sectors, 22829ff7890SWen Congyang QEMUIOVector *qiov) 22929ff7890SWen Congyang { 23029ff7890SWen Congyang BDRVReplicationState *s = bs->opaque; 23129ff7890SWen Congyang int ret; 23229ff7890SWen Congyang 23329ff7890SWen Congyang if (s->mode == REPLICATION_MODE_PRIMARY) { 23429ff7890SWen Congyang /* We only use it to forward primary write requests */ 23529ff7890SWen Congyang return -EIO; 23629ff7890SWen Congyang } 23729ff7890SWen Congyang 23829ff7890SWen Congyang ret = replication_get_io_status(s); 23929ff7890SWen Congyang if (ret < 0) { 24029ff7890SWen Congyang return ret; 24129ff7890SWen Congyang } 24229ff7890SWen Congyang 24304a11d87SEric Blake ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE, 24404a11d87SEric Blake remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0); 245e4f9752cSVladimir Sementsov-Ogievskiy 24629ff7890SWen Congyang return replication_return_value(s, ret); 24729ff7890SWen Congyang } 24829ff7890SWen Congyang 24929ff7890SWen Congyang static coroutine_fn int replication_co_writev(BlockDriverState *bs, 25029ff7890SWen Congyang int64_t sector_num, 25129ff7890SWen Congyang int remaining_sectors, 252e18a58b4SEric Blake QEMUIOVector *qiov, 253e18a58b4SEric Blake int flags) 25429ff7890SWen Congyang { 25529ff7890SWen Congyang BDRVReplicationState *s = bs->opaque; 25629ff7890SWen Congyang QEMUIOVector hd_qiov; 25729ff7890SWen Congyang uint64_t bytes_done = 0; 25829ff7890SWen Congyang BdrvChild *top = bs->file; 25929ff7890SWen Congyang BdrvChild *base = s->secondary_disk; 26029ff7890SWen Congyang BdrvChild *target; 26151b0a488SEric Blake int ret; 26251b0a488SEric Blake int64_t n; 26329ff7890SWen Congyang 264e18a58b4SEric Blake assert(!flags); 26529ff7890SWen Congyang ret = replication_get_io_status(s); 26629ff7890SWen Congyang if (ret < 0) { 26729ff7890SWen Congyang goto out; 26829ff7890SWen Congyang } 26929ff7890SWen Congyang 27029ff7890SWen Congyang if (ret == 0) { 27104a11d87SEric Blake ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE, 27204a11d87SEric Blake remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0); 27329ff7890SWen Congyang return replication_return_value(s, ret); 27429ff7890SWen Congyang } 27529ff7890SWen Congyang 27629ff7890SWen Congyang /* 27729ff7890SWen Congyang * Failover failed, only write to active disk if the sectors 27829ff7890SWen Congyang * have already been allocated in active disk/hidden disk. 27929ff7890SWen Congyang */ 28029ff7890SWen Congyang qemu_iovec_init(&hd_qiov, qiov->niov); 28129ff7890SWen Congyang while (remaining_sectors > 0) { 28251b0a488SEric Blake int64_t count; 28351b0a488SEric Blake 284170d3bd3SAndrey Shinkevich ret = bdrv_is_allocated_above(top->bs, base->bs, false, 28551b0a488SEric Blake sector_num * BDRV_SECTOR_SIZE, 28651b0a488SEric Blake remaining_sectors * BDRV_SECTOR_SIZE, 28751b0a488SEric Blake &count); 28829ff7890SWen Congyang if (ret < 0) { 28929ff7890SWen Congyang goto out1; 29029ff7890SWen Congyang } 29129ff7890SWen Congyang 29251b0a488SEric Blake assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE)); 29351b0a488SEric Blake n = count >> BDRV_SECTOR_BITS; 29429ff7890SWen Congyang qemu_iovec_reset(&hd_qiov); 29551b0a488SEric Blake qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count); 29629ff7890SWen Congyang 29729ff7890SWen Congyang target = ret ? top : base; 29804a11d87SEric Blake ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE, 29904a11d87SEric Blake n * BDRV_SECTOR_SIZE, &hd_qiov, 0); 30029ff7890SWen Congyang if (ret < 0) { 30129ff7890SWen Congyang goto out1; 30229ff7890SWen Congyang } 30329ff7890SWen Congyang 30429ff7890SWen Congyang remaining_sectors -= n; 30529ff7890SWen Congyang sector_num += n; 30651b0a488SEric Blake bytes_done += count; 30729ff7890SWen Congyang } 30829ff7890SWen Congyang 30929ff7890SWen Congyang out1: 31029ff7890SWen Congyang qemu_iovec_destroy(&hd_qiov); 31129ff7890SWen Congyang out: 31229ff7890SWen Congyang return ret; 31329ff7890SWen Congyang } 31429ff7890SWen Congyang 3151e12ecfdSLukas Straub static void secondary_do_checkpoint(BlockDriverState *bs, Error **errp) 31629ff7890SWen Congyang { 3171e12ecfdSLukas Straub BDRVReplicationState *s = bs->opaque; 3181e12ecfdSLukas Straub BdrvChild *active_disk = bs->file; 31929ff7890SWen Congyang Error *local_err = NULL; 32029ff7890SWen Congyang int ret; 32129ff7890SWen Congyang 322cc19f177SVladimir Sementsov-Ogievskiy if (!s->backup_job) { 32329ff7890SWen Congyang error_setg(errp, "Backup job was cancelled unexpectedly"); 32429ff7890SWen Congyang return; 32529ff7890SWen Congyang } 32629ff7890SWen Congyang 327cc19f177SVladimir Sementsov-Ogievskiy backup_do_checkpoint(s->backup_job, &local_err); 32829ff7890SWen Congyang if (local_err) { 32929ff7890SWen Congyang error_propagate(errp, local_err); 33029ff7890SWen Congyang return; 33129ff7890SWen Congyang } 33229ff7890SWen Congyang 3331e12ecfdSLukas Straub if (!active_disk->bs->drv) { 334d470ad42SMax Reitz error_setg(errp, "Active disk %s is ejected", 3351e12ecfdSLukas Straub active_disk->bs->node_name); 336d470ad42SMax Reitz return; 337d470ad42SMax Reitz } 338d470ad42SMax Reitz 3391e12ecfdSLukas Straub ret = bdrv_make_empty(active_disk, errp); 34029ff7890SWen Congyang if (ret < 0) { 34129ff7890SWen Congyang return; 34229ff7890SWen Congyang } 34329ff7890SWen Congyang 344d470ad42SMax Reitz if (!s->hidden_disk->bs->drv) { 345d470ad42SMax Reitz error_setg(errp, "Hidden disk %s is ejected", 346d470ad42SMax Reitz s->hidden_disk->bs->node_name); 347d470ad42SMax Reitz return; 348d470ad42SMax Reitz } 349d470ad42SMax Reitz 350c2cf0ecaSLukas Straub ret = bdrv_make_empty(s->hidden_disk, errp); 35129ff7890SWen Congyang if (ret < 0) { 35229ff7890SWen Congyang return; 35329ff7890SWen Congyang } 35429ff7890SWen Congyang } 35529ff7890SWen Congyang 3563c4e9647SAlberto Garcia /* This function is supposed to be called twice: 3573c4e9647SAlberto Garcia * first with writable = true, then with writable = false. 3583c4e9647SAlberto Garcia * The first call puts s->hidden_disk and s->secondary_disk in 3593c4e9647SAlberto Garcia * r/w mode, and the second puts them back in their original state. 3603c4e9647SAlberto Garcia */ 3618dd9006eSPaolo Bonzini static void reopen_backing_file(BlockDriverState *bs, bool writable, 36229ff7890SWen Congyang Error **errp) 36329ff7890SWen Congyang { 3648dd9006eSPaolo Bonzini BDRVReplicationState *s = bs->opaque; 365a990a42bSLukas Straub BdrvChild *hidden_disk, *secondary_disk; 36629ff7890SWen Congyang BlockReopenQueue *reopen_queue = NULL; 36729ff7890SWen Congyang 368a990a42bSLukas Straub /* 369a990a42bSLukas Straub * s->hidden_disk and s->secondary_disk may not be set yet, as they will 370a990a42bSLukas Straub * only be set after the children are writable. 371a990a42bSLukas Straub */ 372a990a42bSLukas Straub hidden_disk = bs->file->bs->backing; 373a990a42bSLukas Straub secondary_disk = hidden_disk->bs->backing; 374a990a42bSLukas Straub 37529ff7890SWen Congyang if (writable) { 376a990a42bSLukas Straub s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs); 377a990a42bSLukas Straub s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs); 37829ff7890SWen Congyang } 37929ff7890SWen Congyang 380a990a42bSLukas Straub bdrv_subtree_drained_begin(hidden_disk->bs); 381a990a42bSLukas Straub bdrv_subtree_drained_begin(secondary_disk->bs); 3821a63a907SKevin Wolf 3833c4e9647SAlberto Garcia if (s->orig_hidden_read_only) { 3843c4e9647SAlberto Garcia QDict *opts = qdict_new(); 3853c4e9647SAlberto Garcia qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); 386a990a42bSLukas Straub reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs, 387077e8e20SAlberto Garcia opts, true); 38829ff7890SWen Congyang } 38929ff7890SWen Congyang 3903c4e9647SAlberto Garcia if (s->orig_secondary_read_only) { 3913c4e9647SAlberto Garcia QDict *opts = qdict_new(); 3923c4e9647SAlberto Garcia qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable); 393a990a42bSLukas Straub reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs, 394077e8e20SAlberto Garcia opts, true); 39529ff7890SWen Congyang } 39629ff7890SWen Congyang 39729ff7890SWen Congyang if (reopen_queue) { 3986cf42ca2SKevin Wolf AioContext *ctx = bdrv_get_aio_context(bs); 3996cf42ca2SKevin Wolf if (ctx != qemu_get_aio_context()) { 4006cf42ca2SKevin Wolf aio_context_release(ctx); 4016cf42ca2SKevin Wolf } 402992861fbSMarkus Armbruster bdrv_reopen_multiple(reopen_queue, errp); 4036cf42ca2SKevin Wolf if (ctx != qemu_get_aio_context()) { 4046cf42ca2SKevin Wolf aio_context_acquire(ctx); 4056cf42ca2SKevin Wolf } 40629ff7890SWen Congyang } 4071a63a907SKevin Wolf 408a990a42bSLukas Straub bdrv_subtree_drained_end(hidden_disk->bs); 409a990a42bSLukas Straub bdrv_subtree_drained_end(secondary_disk->bs); 41029ff7890SWen Congyang } 41129ff7890SWen Congyang 4128dd9006eSPaolo Bonzini static void backup_job_cleanup(BlockDriverState *bs) 41329ff7890SWen Congyang { 4148dd9006eSPaolo Bonzini BDRVReplicationState *s = bs->opaque; 41529ff7890SWen Congyang BlockDriverState *top_bs; 41629ff7890SWen Congyang 417e140f4b7SLukas Straub s->backup_job = NULL; 418e140f4b7SLukas Straub 41929ff7890SWen Congyang top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL); 42029ff7890SWen Congyang if (!top_bs) { 42129ff7890SWen Congyang return; 42229ff7890SWen Congyang } 42329ff7890SWen Congyang bdrv_op_unblock_all(top_bs, s->blocker); 42429ff7890SWen Congyang error_free(s->blocker); 4258dd9006eSPaolo Bonzini reopen_backing_file(bs, false, NULL); 42629ff7890SWen Congyang } 42729ff7890SWen Congyang 42829ff7890SWen Congyang static void backup_job_completed(void *opaque, int ret) 42929ff7890SWen Congyang { 4308dd9006eSPaolo Bonzini BlockDriverState *bs = opaque; 4318dd9006eSPaolo Bonzini BDRVReplicationState *s = bs->opaque; 43229ff7890SWen Congyang 4333c76c606SFam Zheng if (s->stage != BLOCK_REPLICATION_FAILOVER) { 43429ff7890SWen Congyang /* The backup job is cancelled unexpectedly */ 43529ff7890SWen Congyang s->error = -EIO; 43629ff7890SWen Congyang } 43729ff7890SWen Congyang 4388dd9006eSPaolo Bonzini backup_job_cleanup(bs); 43929ff7890SWen Congyang } 44029ff7890SWen Congyang 44129ff7890SWen Congyang static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs) 44229ff7890SWen Congyang { 44329ff7890SWen Congyang BdrvChild *child; 44429ff7890SWen Congyang 44529ff7890SWen Congyang /* The bs itself is the top_bs */ 44629ff7890SWen Congyang if (top_bs == bs) { 44729ff7890SWen Congyang return true; 44829ff7890SWen Congyang } 44929ff7890SWen Congyang 45029ff7890SWen Congyang /* Iterate over top_bs's children */ 45129ff7890SWen Congyang QLIST_FOREACH(child, &top_bs->children, next) { 45229ff7890SWen Congyang if (child->bs == bs || check_top_bs(child->bs, bs)) { 45329ff7890SWen Congyang return true; 45429ff7890SWen Congyang } 45529ff7890SWen Congyang } 45629ff7890SWen Congyang 45729ff7890SWen Congyang return false; 45829ff7890SWen Congyang } 45929ff7890SWen Congyang 46029ff7890SWen Congyang static void replication_start(ReplicationState *rs, ReplicationMode mode, 46129ff7890SWen Congyang Error **errp) 46229ff7890SWen Congyang { 46329ff7890SWen Congyang BlockDriverState *bs = rs->opaque; 46429ff7890SWen Congyang BDRVReplicationState *s; 46529ff7890SWen Congyang BlockDriverState *top_bs; 466a990a42bSLukas Straub BdrvChild *active_disk, *hidden_disk, *secondary_disk; 46729ff7890SWen Congyang int64_t active_length, hidden_length, disk_length; 46829ff7890SWen Congyang AioContext *aio_context; 46929ff7890SWen Congyang Error *local_err = NULL; 4702c59fd83SVladimir Sementsov-Ogievskiy BackupPerf perf = { .use_copy_range = true, .max_workers = 1 }; 47129ff7890SWen Congyang 47229ff7890SWen Congyang aio_context = bdrv_get_aio_context(bs); 47329ff7890SWen Congyang aio_context_acquire(aio_context); 47429ff7890SWen Congyang s = bs->opaque; 47529ff7890SWen Congyang 47608ddb4ebSLukas Straub if (s->stage == BLOCK_REPLICATION_DONE || 47708ddb4ebSLukas Straub s->stage == BLOCK_REPLICATION_FAILOVER) { 47808ddb4ebSLukas Straub /* 47908ddb4ebSLukas Straub * This case happens when a secondary is promoted to primary. 48008ddb4ebSLukas Straub * Ignore the request because the secondary side of replication 48108ddb4ebSLukas Straub * doesn't have to do anything anymore. 48208ddb4ebSLukas Straub */ 48308ddb4ebSLukas Straub aio_context_release(aio_context); 48408ddb4ebSLukas Straub return; 48508ddb4ebSLukas Straub } 48608ddb4ebSLukas Straub 4873c76c606SFam Zheng if (s->stage != BLOCK_REPLICATION_NONE) { 48829ff7890SWen Congyang error_setg(errp, "Block replication is running or done"); 48929ff7890SWen Congyang aio_context_release(aio_context); 49029ff7890SWen Congyang return; 49129ff7890SWen Congyang } 49229ff7890SWen Congyang 49329ff7890SWen Congyang if (s->mode != mode) { 49429ff7890SWen Congyang error_setg(errp, "The parameter mode's value is invalid, needs %d," 49529ff7890SWen Congyang " but got %d", s->mode, mode); 49629ff7890SWen Congyang aio_context_release(aio_context); 49729ff7890SWen Congyang return; 49829ff7890SWen Congyang } 49929ff7890SWen Congyang 50029ff7890SWen Congyang switch (s->mode) { 50129ff7890SWen Congyang case REPLICATION_MODE_PRIMARY: 50229ff7890SWen Congyang break; 50329ff7890SWen Congyang case REPLICATION_MODE_SECONDARY: 5041e12ecfdSLukas Straub active_disk = bs->file; 5051e12ecfdSLukas Straub if (!active_disk || !active_disk->bs || !active_disk->bs->backing) { 50629ff7890SWen Congyang error_setg(errp, "Active disk doesn't have backing file"); 50729ff7890SWen Congyang aio_context_release(aio_context); 50829ff7890SWen Congyang return; 50929ff7890SWen Congyang } 51029ff7890SWen Congyang 511a990a42bSLukas Straub hidden_disk = active_disk->bs->backing; 512a990a42bSLukas Straub if (!hidden_disk->bs || !hidden_disk->bs->backing) { 51329ff7890SWen Congyang error_setg(errp, "Hidden disk doesn't have backing file"); 51429ff7890SWen Congyang aio_context_release(aio_context); 51529ff7890SWen Congyang return; 51629ff7890SWen Congyang } 51729ff7890SWen Congyang 518a990a42bSLukas Straub secondary_disk = hidden_disk->bs->backing; 519a990a42bSLukas Straub if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) { 52029ff7890SWen Congyang error_setg(errp, "The secondary disk doesn't have block backend"); 52129ff7890SWen Congyang aio_context_release(aio_context); 52229ff7890SWen Congyang return; 52329ff7890SWen Congyang } 52429ff7890SWen Congyang 52529ff7890SWen Congyang /* verify the length */ 5261e12ecfdSLukas Straub active_length = bdrv_getlength(active_disk->bs); 527a990a42bSLukas Straub hidden_length = bdrv_getlength(hidden_disk->bs); 528a990a42bSLukas Straub disk_length = bdrv_getlength(secondary_disk->bs); 52929ff7890SWen Congyang if (active_length < 0 || hidden_length < 0 || disk_length < 0 || 53029ff7890SWen Congyang active_length != hidden_length || hidden_length != disk_length) { 53129ff7890SWen Congyang error_setg(errp, "Active disk, hidden disk, secondary disk's length" 53229ff7890SWen Congyang " are not the same"); 53329ff7890SWen Congyang aio_context_release(aio_context); 53429ff7890SWen Congyang return; 53529ff7890SWen Congyang } 53629ff7890SWen Congyang 537d470ad42SMax Reitz /* Must be true, or the bdrv_getlength() calls would have failed */ 538a990a42bSLukas Straub assert(active_disk->bs->drv && hidden_disk->bs->drv); 539d470ad42SMax Reitz 5401e12ecfdSLukas Straub if (!active_disk->bs->drv->bdrv_make_empty || 541a990a42bSLukas Straub !hidden_disk->bs->drv->bdrv_make_empty) { 54229ff7890SWen Congyang error_setg(errp, 54329ff7890SWen Congyang "Active disk or hidden disk doesn't support make_empty"); 54429ff7890SWen Congyang aio_context_release(aio_context); 54529ff7890SWen Congyang return; 54629ff7890SWen Congyang } 54729ff7890SWen Congyang 54829ff7890SWen Congyang /* reopen the backing file in r/w mode */ 5498dd9006eSPaolo Bonzini reopen_backing_file(bs, true, &local_err); 55029ff7890SWen Congyang if (local_err) { 55129ff7890SWen Congyang error_propagate(errp, local_err); 55229ff7890SWen Congyang aio_context_release(aio_context); 55329ff7890SWen Congyang return; 55429ff7890SWen Congyang } 55529ff7890SWen Congyang 5563b78420bSLukas Straub bdrv_ref(hidden_disk->bs); 5573b78420bSLukas Straub s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk", 5583b78420bSLukas Straub &child_of_bds, BDRV_CHILD_DATA, 5593b78420bSLukas Straub &local_err); 5603b78420bSLukas Straub if (local_err) { 5613b78420bSLukas Straub error_propagate(errp, local_err); 5623b78420bSLukas Straub aio_context_release(aio_context); 5633b78420bSLukas Straub return; 5643b78420bSLukas Straub } 5653b78420bSLukas Straub 5663b78420bSLukas Straub bdrv_ref(secondary_disk->bs); 5673b78420bSLukas Straub s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs, 5683b78420bSLukas Straub "secondary disk", &child_of_bds, 5693b78420bSLukas Straub BDRV_CHILD_DATA, &local_err); 5703b78420bSLukas Straub if (local_err) { 5713b78420bSLukas Straub error_propagate(errp, local_err); 5723b78420bSLukas Straub aio_context_release(aio_context); 5733b78420bSLukas Straub return; 5743b78420bSLukas Straub } 575a990a42bSLukas Straub 57629ff7890SWen Congyang /* start backup job now */ 57729ff7890SWen Congyang error_setg(&s->blocker, 57829ff7890SWen Congyang "Block device is in use by internal backup job"); 57929ff7890SWen Congyang 58029ff7890SWen Congyang top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL); 58129ff7890SWen Congyang if (!top_bs || !bdrv_is_root_node(top_bs) || 58229ff7890SWen Congyang !check_top_bs(top_bs, bs)) { 58329ff7890SWen Congyang error_setg(errp, "No top_bs or it is invalid"); 5848dd9006eSPaolo Bonzini reopen_backing_file(bs, false, NULL); 58529ff7890SWen Congyang aio_context_release(aio_context); 58629ff7890SWen Congyang return; 58729ff7890SWen Congyang } 58829ff7890SWen Congyang bdrv_op_block_all(top_bs, s->blocker); 58929ff7890SWen Congyang bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker); 59029ff7890SWen Congyang 591cc19f177SVladimir Sementsov-Ogievskiy s->backup_job = backup_job_create( 592cc19f177SVladimir Sementsov-Ogievskiy NULL, s->secondary_disk->bs, s->hidden_disk->bs, 59300e30f05SVladimir Sementsov-Ogievskiy 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL, 59486c6a3b6SVladimir Sementsov-Ogievskiy &perf, 595111049a4SJohn Snow BLOCKDEV_ON_ERROR_REPORT, 596bb02b65cSKevin Wolf BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL, 597111049a4SJohn Snow backup_job_completed, bs, NULL, &local_err); 59829ff7890SWen Congyang if (local_err) { 59929ff7890SWen Congyang error_propagate(errp, local_err); 6008dd9006eSPaolo Bonzini backup_job_cleanup(bs); 60129ff7890SWen Congyang aio_context_release(aio_context); 60229ff7890SWen Congyang return; 60329ff7890SWen Congyang } 604cc19f177SVladimir Sementsov-Ogievskiy job_start(&s->backup_job->job); 60529ff7890SWen Congyang break; 60629ff7890SWen Congyang default: 60729ff7890SWen Congyang aio_context_release(aio_context); 60829ff7890SWen Congyang abort(); 60929ff7890SWen Congyang } 61029ff7890SWen Congyang 6113c76c606SFam Zheng s->stage = BLOCK_REPLICATION_RUNNING; 61229ff7890SWen Congyang 61329ff7890SWen Congyang if (s->mode == REPLICATION_MODE_SECONDARY) { 6141e12ecfdSLukas Straub secondary_do_checkpoint(bs, errp); 61529ff7890SWen Congyang } 61629ff7890SWen Congyang 61729ff7890SWen Congyang s->error = 0; 61829ff7890SWen Congyang aio_context_release(aio_context); 61929ff7890SWen Congyang } 62029ff7890SWen Congyang 62129ff7890SWen Congyang static void replication_do_checkpoint(ReplicationState *rs, Error **errp) 62229ff7890SWen Congyang { 62329ff7890SWen Congyang BlockDriverState *bs = rs->opaque; 62429ff7890SWen Congyang BDRVReplicationState *s; 62529ff7890SWen Congyang AioContext *aio_context; 62629ff7890SWen Congyang 62729ff7890SWen Congyang aio_context = bdrv_get_aio_context(bs); 62829ff7890SWen Congyang aio_context_acquire(aio_context); 62929ff7890SWen Congyang s = bs->opaque; 63029ff7890SWen Congyang 63108ddb4ebSLukas Straub if (s->stage == BLOCK_REPLICATION_DONE || 63208ddb4ebSLukas Straub s->stage == BLOCK_REPLICATION_FAILOVER) { 63308ddb4ebSLukas Straub /* 63408ddb4ebSLukas Straub * This case happens when a secondary was promoted to primary. 63508ddb4ebSLukas Straub * Ignore the request because the secondary side of replication 63608ddb4ebSLukas Straub * doesn't have to do anything anymore. 63708ddb4ebSLukas Straub */ 63808ddb4ebSLukas Straub aio_context_release(aio_context); 63908ddb4ebSLukas Straub return; 64008ddb4ebSLukas Straub } 64108ddb4ebSLukas Straub 64229ff7890SWen Congyang if (s->mode == REPLICATION_MODE_SECONDARY) { 6431e12ecfdSLukas Straub secondary_do_checkpoint(bs, errp); 64429ff7890SWen Congyang } 64529ff7890SWen Congyang aio_context_release(aio_context); 64629ff7890SWen Congyang } 64729ff7890SWen Congyang 64829ff7890SWen Congyang static void replication_get_error(ReplicationState *rs, Error **errp) 64929ff7890SWen Congyang { 65029ff7890SWen Congyang BlockDriverState *bs = rs->opaque; 65129ff7890SWen Congyang BDRVReplicationState *s; 65229ff7890SWen Congyang AioContext *aio_context; 65329ff7890SWen Congyang 65429ff7890SWen Congyang aio_context = bdrv_get_aio_context(bs); 65529ff7890SWen Congyang aio_context_acquire(aio_context); 65629ff7890SWen Congyang s = bs->opaque; 65729ff7890SWen Congyang 65808ddb4ebSLukas Straub if (s->stage == BLOCK_REPLICATION_NONE) { 65929ff7890SWen Congyang error_setg(errp, "Block replication is not running"); 66029ff7890SWen Congyang aio_context_release(aio_context); 66129ff7890SWen Congyang return; 66229ff7890SWen Congyang } 66329ff7890SWen Congyang 66429ff7890SWen Congyang if (s->error) { 66529ff7890SWen Congyang error_setg(errp, "I/O error occurred"); 66629ff7890SWen Congyang aio_context_release(aio_context); 66729ff7890SWen Congyang return; 66829ff7890SWen Congyang } 66929ff7890SWen Congyang aio_context_release(aio_context); 67029ff7890SWen Congyang } 67129ff7890SWen Congyang 67229ff7890SWen Congyang static void replication_done(void *opaque, int ret) 67329ff7890SWen Congyang { 67429ff7890SWen Congyang BlockDriverState *bs = opaque; 67529ff7890SWen Congyang BDRVReplicationState *s = bs->opaque; 67629ff7890SWen Congyang 67729ff7890SWen Congyang if (ret == 0) { 6783c76c606SFam Zheng s->stage = BLOCK_REPLICATION_DONE; 67929ff7890SWen Congyang 6803b78420bSLukas Straub bdrv_unref_child(bs, s->secondary_disk); 68129ff7890SWen Congyang s->secondary_disk = NULL; 6823b78420bSLukas Straub bdrv_unref_child(bs, s->hidden_disk); 68329ff7890SWen Congyang s->hidden_disk = NULL; 68429ff7890SWen Congyang s->error = 0; 68529ff7890SWen Congyang } else { 6863c76c606SFam Zheng s->stage = BLOCK_REPLICATION_FAILOVER_FAILED; 68729ff7890SWen Congyang s->error = -EIO; 68829ff7890SWen Congyang } 68929ff7890SWen Congyang } 69029ff7890SWen Congyang 69129ff7890SWen Congyang static void replication_stop(ReplicationState *rs, bool failover, Error **errp) 69229ff7890SWen Congyang { 69329ff7890SWen Congyang BlockDriverState *bs = rs->opaque; 69429ff7890SWen Congyang BDRVReplicationState *s; 69529ff7890SWen Congyang AioContext *aio_context; 69629ff7890SWen Congyang 69729ff7890SWen Congyang aio_context = bdrv_get_aio_context(bs); 69829ff7890SWen Congyang aio_context_acquire(aio_context); 69929ff7890SWen Congyang s = bs->opaque; 70029ff7890SWen Congyang 70108ddb4ebSLukas Straub if (s->stage == BLOCK_REPLICATION_DONE || 70208ddb4ebSLukas Straub s->stage == BLOCK_REPLICATION_FAILOVER) { 70308ddb4ebSLukas Straub /* 70408ddb4ebSLukas Straub * This case happens when a secondary was promoted to primary. 70508ddb4ebSLukas Straub * Ignore the request because the secondary side of replication 70608ddb4ebSLukas Straub * doesn't have to do anything anymore. 70708ddb4ebSLukas Straub */ 70808ddb4ebSLukas Straub aio_context_release(aio_context); 70908ddb4ebSLukas Straub return; 71008ddb4ebSLukas Straub } 71108ddb4ebSLukas Straub 7123c76c606SFam Zheng if (s->stage != BLOCK_REPLICATION_RUNNING) { 71329ff7890SWen Congyang error_setg(errp, "Block replication is not running"); 71429ff7890SWen Congyang aio_context_release(aio_context); 71529ff7890SWen Congyang return; 71629ff7890SWen Congyang } 71729ff7890SWen Congyang 71829ff7890SWen Congyang switch (s->mode) { 71929ff7890SWen Congyang case REPLICATION_MODE_PRIMARY: 7203c76c606SFam Zheng s->stage = BLOCK_REPLICATION_DONE; 72129ff7890SWen Congyang s->error = 0; 72229ff7890SWen Congyang break; 72329ff7890SWen Congyang case REPLICATION_MODE_SECONDARY: 72429ff7890SWen Congyang /* 72529ff7890SWen Congyang * This BDS will be closed, and the job should be completed 72629ff7890SWen Congyang * before the BDS is closed, because we will access hidden 72729ff7890SWen Congyang * disk, secondary disk in backup_job_completed(). 72829ff7890SWen Congyang */ 729cc19f177SVladimir Sementsov-Ogievskiy if (s->backup_job) { 730*6f592e5aSEmanuele Giuseppe Esposito aio_context_release(aio_context); 7314cfb3f05SHanna Reitz job_cancel_sync(&s->backup_job->job, true); 732*6f592e5aSEmanuele Giuseppe Esposito aio_context_acquire(aio_context); 73329ff7890SWen Congyang } 73429ff7890SWen Congyang 73529ff7890SWen Congyang if (!failover) { 7361e12ecfdSLukas Straub secondary_do_checkpoint(bs, errp); 7373c76c606SFam Zheng s->stage = BLOCK_REPLICATION_DONE; 73829ff7890SWen Congyang aio_context_release(aio_context); 73929ff7890SWen Congyang return; 74029ff7890SWen Congyang } 74129ff7890SWen Congyang 7423c76c606SFam Zheng s->stage = BLOCK_REPLICATION_FAILOVER; 743cc19f177SVladimir Sementsov-Ogievskiy s->commit_job = commit_active_start( 7441e12ecfdSLukas Straub NULL, bs->file->bs, s->secondary_disk->bs, 745bb02b65cSKevin Wolf JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT, 74678bbd910SFam Zheng NULL, replication_done, bs, true, errp); 74729ff7890SWen Congyang break; 74829ff7890SWen Congyang default: 74929ff7890SWen Congyang aio_context_release(aio_context); 75029ff7890SWen Congyang abort(); 75129ff7890SWen Congyang } 75229ff7890SWen Congyang aio_context_release(aio_context); 75329ff7890SWen Congyang } 75429ff7890SWen Congyang 7552654267cSMax Reitz static const char *const replication_strong_runtime_opts[] = { 7562654267cSMax Reitz REPLICATION_MODE, 7572654267cSMax Reitz REPLICATION_TOP_ID, 7582654267cSMax Reitz 7592654267cSMax Reitz NULL 7602654267cSMax Reitz }; 7612654267cSMax Reitz 762782b9d06SAlberto Garcia static BlockDriver bdrv_replication = { 76329ff7890SWen Congyang .format_name = "replication", 76429ff7890SWen Congyang .instance_size = sizeof(BDRVReplicationState), 76529ff7890SWen Congyang 76629ff7890SWen Congyang .bdrv_open = replication_open, 76729ff7890SWen Congyang .bdrv_close = replication_close, 76837a9051cSChanglong Xie .bdrv_child_perm = replication_child_perm, 76929ff7890SWen Congyang 77029ff7890SWen Congyang .bdrv_getlength = replication_getlength, 77129ff7890SWen Congyang .bdrv_co_readv = replication_co_readv, 77229ff7890SWen Congyang .bdrv_co_writev = replication_co_writev, 77329ff7890SWen Congyang 77429ff7890SWen Congyang .is_filter = true, 77529ff7890SWen Congyang 77629ff7890SWen Congyang .has_variable_length = true, 7772654267cSMax Reitz .strong_runtime_opts = replication_strong_runtime_opts, 77829ff7890SWen Congyang }; 77929ff7890SWen Congyang 78029ff7890SWen Congyang static void bdrv_replication_init(void) 78129ff7890SWen Congyang { 78229ff7890SWen Congyang bdrv_register(&bdrv_replication); 78329ff7890SWen Congyang } 78429ff7890SWen Congyang 78529ff7890SWen Congyang block_init(bdrv_replication_init); 786