1 /* 2 * copy-before-write filter driver 3 * 4 * The driver performs Copy-Before-Write (CBW) operation: it is injected above 5 * some node, and before each write it copies _old_ data to the target node. 6 * 7 * Copyright (c) 2018-2021 Virtuozzo International GmbH. 8 * 9 * Author: 10 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com> 11 * 12 * This program is free software; you can redistribute it and/or modify 13 * it under the terms of the GNU General Public License as published by 14 * the Free Software Foundation; either version 2 of the License, or 15 * (at your option) any later version. 16 * 17 * This program is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * GNU General Public License for more details. 21 * 22 * You should have received a copy of the GNU General Public License 23 * along with this program. If not, see <http://www.gnu.org/licenses/>. 24 */ 25 26 #include "qemu/osdep.h" 27 28 #include "sysemu/block-backend.h" 29 #include "qemu/cutils.h" 30 #include "qapi/error.h" 31 #include "block/block_int.h" 32 #include "block/qdict.h" 33 #include "block/block-copy.h" 34 35 #include "block/copy-before-write.h" 36 #include "block/reqlist.h" 37 38 #include "qapi/qapi-visit-block-core.h" 39 40 typedef struct BDRVCopyBeforeWriteState { 41 BlockCopyState *bcs; 42 BdrvChild *target; 43 44 /* 45 * @lock: protects access to @access_bitmap, @done_bitmap and 46 * @frozen_read_reqs 47 */ 48 CoMutex lock; 49 50 /* 51 * @access_bitmap: represents areas allowed for reading by fleecing user. 52 * Reading from non-dirty areas leads to -EACCES. 53 */ 54 BdrvDirtyBitmap *access_bitmap; 55 56 /* 57 * @done_bitmap: represents areas that was successfully copied to @target by 58 * copy-before-write operations. 59 */ 60 BdrvDirtyBitmap *done_bitmap; 61 62 /* 63 * @frozen_read_reqs: current read requests for fleecing user in bs->file 64 * node. These areas must not be rewritten by guest. 65 */ 66 BlockReqList frozen_read_reqs; 67 } BDRVCopyBeforeWriteState; 68 69 static coroutine_fn int cbw_co_preadv( 70 BlockDriverState *bs, int64_t offset, int64_t bytes, 71 QEMUIOVector *qiov, BdrvRequestFlags flags) 72 { 73 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); 74 } 75 76 /* 77 * Do copy-before-write operation. 78 * 79 * On failure guest request must be failed too. 80 * 81 * On success, we also wait for all in-flight fleecing read requests in source 82 * node, and it's guaranteed that after cbw_do_copy_before_write() successful 83 * return there are no such requests and they will never appear. 84 */ 85 static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs, 86 uint64_t offset, uint64_t bytes, BdrvRequestFlags flags) 87 { 88 BDRVCopyBeforeWriteState *s = bs->opaque; 89 int ret; 90 uint64_t off, end; 91 int64_t cluster_size = block_copy_cluster_size(s->bcs); 92 93 if (flags & BDRV_REQ_WRITE_UNCHANGED) { 94 return 0; 95 } 96 97 off = QEMU_ALIGN_DOWN(offset, cluster_size); 98 end = QEMU_ALIGN_UP(offset + bytes, cluster_size); 99 100 ret = block_copy(s->bcs, off, end - off, true); 101 if (ret < 0) { 102 return ret; 103 } 104 105 WITH_QEMU_LOCK_GUARD(&s->lock) { 106 bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off); 107 reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock); 108 } 109 110 return 0; 111 } 112 113 static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs, 114 int64_t offset, int64_t bytes) 115 { 116 int ret = cbw_do_copy_before_write(bs, offset, bytes, 0); 117 if (ret < 0) { 118 return ret; 119 } 120 121 return bdrv_co_pdiscard(bs->file, offset, bytes); 122 } 123 124 static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs, 125 int64_t offset, int64_t bytes, BdrvRequestFlags flags) 126 { 127 int ret = cbw_do_copy_before_write(bs, offset, bytes, flags); 128 if (ret < 0) { 129 return ret; 130 } 131 132 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); 133 } 134 135 static coroutine_fn int cbw_co_pwritev(BlockDriverState *bs, 136 int64_t offset, 137 int64_t bytes, 138 QEMUIOVector *qiov, 139 BdrvRequestFlags flags) 140 { 141 int ret = cbw_do_copy_before_write(bs, offset, bytes, flags); 142 if (ret < 0) { 143 return ret; 144 } 145 146 return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); 147 } 148 149 static int coroutine_fn cbw_co_flush(BlockDriverState *bs) 150 { 151 if (!bs->file) { 152 return 0; 153 } 154 155 return bdrv_co_flush(bs->file->bs); 156 } 157 158 /* 159 * If @offset not accessible - return NULL. 160 * 161 * Otherwise, set @pnum to some bytes that accessible from @file (@file is set 162 * to bs->file or to s->target). Return newly allocated BlockReq object that 163 * should be than passed to cbw_snapshot_read_unlock(). 164 * 165 * It's guaranteed that guest writes will not interact in the region until 166 * cbw_snapshot_read_unlock() called. 167 */ 168 static BlockReq *cbw_snapshot_read_lock(BlockDriverState *bs, 169 int64_t offset, int64_t bytes, 170 int64_t *pnum, BdrvChild **file) 171 { 172 BDRVCopyBeforeWriteState *s = bs->opaque; 173 BlockReq *req = g_new(BlockReq, 1); 174 bool done; 175 176 QEMU_LOCK_GUARD(&s->lock); 177 178 if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) { 179 g_free(req); 180 return NULL; 181 } 182 183 done = bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, pnum); 184 if (done) { 185 /* 186 * Special invalid BlockReq, that is handled in 187 * cbw_snapshot_read_unlock(). We don't need to lock something to read 188 * from s->target. 189 */ 190 *req = (BlockReq) {.offset = -1, .bytes = -1}; 191 *file = s->target; 192 } else { 193 reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes); 194 *file = bs->file; 195 } 196 197 return req; 198 } 199 200 static void cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req) 201 { 202 BDRVCopyBeforeWriteState *s = bs->opaque; 203 204 if (req->offset == -1 && req->bytes == -1) { 205 g_free(req); 206 return; 207 } 208 209 QEMU_LOCK_GUARD(&s->lock); 210 211 reqlist_remove_req(req); 212 g_free(req); 213 } 214 215 static coroutine_fn int 216 cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes, 217 QEMUIOVector *qiov, size_t qiov_offset) 218 { 219 BlockReq *req; 220 BdrvChild *file; 221 int ret; 222 223 /* TODO: upgrade to async loop using AioTask */ 224 while (bytes) { 225 int64_t cur_bytes; 226 227 req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &file); 228 if (!req) { 229 return -EACCES; 230 } 231 232 ret = bdrv_co_preadv_part(file, offset, cur_bytes, 233 qiov, qiov_offset, 0); 234 cbw_snapshot_read_unlock(bs, req); 235 if (ret < 0) { 236 return ret; 237 } 238 239 bytes -= cur_bytes; 240 offset += cur_bytes; 241 qiov_offset += cur_bytes; 242 } 243 244 return 0; 245 } 246 247 static int coroutine_fn 248 cbw_co_snapshot_block_status(BlockDriverState *bs, 249 bool want_zero, int64_t offset, int64_t bytes, 250 int64_t *pnum, int64_t *map, 251 BlockDriverState **file) 252 { 253 BDRVCopyBeforeWriteState *s = bs->opaque; 254 BlockReq *req; 255 int ret; 256 int64_t cur_bytes; 257 BdrvChild *child; 258 259 req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &child); 260 if (!req) { 261 return -EACCES; 262 } 263 264 ret = bdrv_block_status(child->bs, offset, cur_bytes, pnum, map, file); 265 if (child == s->target) { 266 /* 267 * We refer to s->target only for areas that we've written to it. 268 * And we can not report unallocated blocks in s->target: this will 269 * break generic block-status-above logic, that will go to 270 * copy-before-write filtered child in this case. 271 */ 272 assert(ret & BDRV_BLOCK_ALLOCATED); 273 } 274 275 cbw_snapshot_read_unlock(bs, req); 276 277 return ret; 278 } 279 280 static int coroutine_fn cbw_co_pdiscard_snapshot(BlockDriverState *bs, 281 int64_t offset, int64_t bytes) 282 { 283 BDRVCopyBeforeWriteState *s = bs->opaque; 284 285 WITH_QEMU_LOCK_GUARD(&s->lock) { 286 bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes); 287 } 288 289 block_copy_reset(s->bcs, offset, bytes); 290 291 return bdrv_co_pdiscard(s->target, offset, bytes); 292 } 293 294 static void cbw_refresh_filename(BlockDriverState *bs) 295 { 296 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), 297 bs->file->bs->filename); 298 } 299 300 static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c, 301 BdrvChildRole role, 302 BlockReopenQueue *reopen_queue, 303 uint64_t perm, uint64_t shared, 304 uint64_t *nperm, uint64_t *nshared) 305 { 306 if (!(role & BDRV_CHILD_FILTERED)) { 307 /* 308 * Target child 309 * 310 * Share write to target (child_file), to not interfere 311 * with guest writes to its disk which may be in target backing chain. 312 * Can't resize during a backup block job because we check the size 313 * only upfront. 314 */ 315 *nshared = BLK_PERM_ALL & ~BLK_PERM_RESIZE; 316 *nperm = BLK_PERM_WRITE; 317 } else { 318 /* Source child */ 319 bdrv_default_perms(bs, c, role, reopen_queue, 320 perm, shared, nperm, nshared); 321 322 if (!QLIST_EMPTY(&bs->parents)) { 323 if (perm & BLK_PERM_WRITE) { 324 *nperm = *nperm | BLK_PERM_CONSISTENT_READ; 325 } 326 *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); 327 } 328 } 329 } 330 331 static bool cbw_parse_bitmap_option(QDict *options, BdrvDirtyBitmap **bitmap, 332 Error **errp) 333 { 334 QDict *bitmap_qdict = NULL; 335 BlockDirtyBitmap *bmp_param = NULL; 336 Visitor *v = NULL; 337 bool ret = false; 338 339 *bitmap = NULL; 340 341 qdict_extract_subqdict(options, &bitmap_qdict, "bitmap."); 342 if (!qdict_size(bitmap_qdict)) { 343 ret = true; 344 goto out; 345 } 346 347 v = qobject_input_visitor_new_flat_confused(bitmap_qdict, errp); 348 if (!v) { 349 goto out; 350 } 351 352 visit_type_BlockDirtyBitmap(v, NULL, &bmp_param, errp); 353 if (!bmp_param) { 354 goto out; 355 } 356 357 *bitmap = block_dirty_bitmap_lookup(bmp_param->node, bmp_param->name, NULL, 358 errp); 359 if (!*bitmap) { 360 goto out; 361 } 362 363 ret = true; 364 365 out: 366 qapi_free_BlockDirtyBitmap(bmp_param); 367 visit_free(v); 368 qobject_unref(bitmap_qdict); 369 370 return ret; 371 } 372 373 static int cbw_open(BlockDriverState *bs, QDict *options, int flags, 374 Error **errp) 375 { 376 BDRVCopyBeforeWriteState *s = bs->opaque; 377 BdrvDirtyBitmap *bitmap = NULL; 378 int64_t cluster_size; 379 380 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, 381 BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, 382 false, errp); 383 if (!bs->file) { 384 return -EINVAL; 385 } 386 387 s->target = bdrv_open_child(NULL, options, "target", bs, &child_of_bds, 388 BDRV_CHILD_DATA, false, errp); 389 if (!s->target) { 390 return -EINVAL; 391 } 392 393 if (!cbw_parse_bitmap_option(options, &bitmap, errp)) { 394 return -EINVAL; 395 } 396 397 bs->total_sectors = bs->file->bs->total_sectors; 398 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | 399 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); 400 bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | 401 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & 402 bs->file->bs->supported_zero_flags); 403 404 s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp); 405 if (!s->bcs) { 406 error_prepend(errp, "Cannot create block-copy-state: "); 407 return -EINVAL; 408 } 409 410 cluster_size = block_copy_cluster_size(s->bcs); 411 412 s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp); 413 if (!s->done_bitmap) { 414 return -EINVAL; 415 } 416 bdrv_disable_dirty_bitmap(s->done_bitmap); 417 418 /* s->access_bitmap starts equal to bcs bitmap */ 419 s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp); 420 if (!s->access_bitmap) { 421 return -EINVAL; 422 } 423 bdrv_disable_dirty_bitmap(s->access_bitmap); 424 bdrv_dirty_bitmap_merge_internal(s->access_bitmap, 425 block_copy_dirty_bitmap(s->bcs), NULL, 426 true); 427 428 qemu_co_mutex_init(&s->lock); 429 QLIST_INIT(&s->frozen_read_reqs); 430 431 return 0; 432 } 433 434 static void cbw_close(BlockDriverState *bs) 435 { 436 BDRVCopyBeforeWriteState *s = bs->opaque; 437 438 bdrv_release_dirty_bitmap(s->access_bitmap); 439 bdrv_release_dirty_bitmap(s->done_bitmap); 440 441 block_copy_state_free(s->bcs); 442 s->bcs = NULL; 443 } 444 445 BlockDriver bdrv_cbw_filter = { 446 .format_name = "copy-before-write", 447 .instance_size = sizeof(BDRVCopyBeforeWriteState), 448 449 .bdrv_open = cbw_open, 450 .bdrv_close = cbw_close, 451 452 .bdrv_co_preadv = cbw_co_preadv, 453 .bdrv_co_pwritev = cbw_co_pwritev, 454 .bdrv_co_pwrite_zeroes = cbw_co_pwrite_zeroes, 455 .bdrv_co_pdiscard = cbw_co_pdiscard, 456 .bdrv_co_flush = cbw_co_flush, 457 458 .bdrv_co_preadv_snapshot = cbw_co_preadv_snapshot, 459 .bdrv_co_pdiscard_snapshot = cbw_co_pdiscard_snapshot, 460 .bdrv_co_snapshot_block_status = cbw_co_snapshot_block_status, 461 462 .bdrv_refresh_filename = cbw_refresh_filename, 463 464 .bdrv_child_perm = cbw_child_perm, 465 466 .is_filter = true, 467 }; 468 469 BlockDriverState *bdrv_cbw_append(BlockDriverState *source, 470 BlockDriverState *target, 471 const char *filter_node_name, 472 BlockCopyState **bcs, 473 Error **errp) 474 { 475 ERRP_GUARD(); 476 BDRVCopyBeforeWriteState *state; 477 BlockDriverState *top; 478 QDict *opts; 479 480 assert(source->total_sectors == target->total_sectors); 481 GLOBAL_STATE_CODE(); 482 483 opts = qdict_new(); 484 qdict_put_str(opts, "driver", "copy-before-write"); 485 if (filter_node_name) { 486 qdict_put_str(opts, "node-name", filter_node_name); 487 } 488 qdict_put_str(opts, "file", bdrv_get_node_name(source)); 489 qdict_put_str(opts, "target", bdrv_get_node_name(target)); 490 491 top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp); 492 if (!top) { 493 return NULL; 494 } 495 496 state = top->opaque; 497 *bcs = state->bcs; 498 499 return top; 500 } 501 502 void bdrv_cbw_drop(BlockDriverState *bs) 503 { 504 GLOBAL_STATE_CODE(); 505 bdrv_drop_filter(bs, &error_abort); 506 bdrv_unref(bs); 507 } 508 509 static void cbw_init(void) 510 { 511 bdrv_register(&bdrv_cbw_filter); 512 } 513 514 block_init(cbw_init); 515