/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
                                                       int64_t start,
                                                       int64_t end)
{
    BlockCopyInFlightReq *req;
    bool waited;

    do {
        waited = false;
        QLIST_FOREACH(req, &s->inflight_reqs, list) {
            if (end > req->start_byte && start < req->end_byte) {
                qemu_co_queue_wait(&req->wait_queue, NULL);
                waited = true;
                break;
            }
        }
    } while (waited);
}

static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t start, int64_t end)
{
    req->start_byte = start;
    req->end_byte = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

static void coroutine_fn block_copy_inflight_req_end(BlockCopyInFlightReq *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;
    uint32_t max_transfer =
            MIN_NON_ZERO(INT_MAX,
                         MIN_NON_ZERO(source->bs->bl.max_transfer,
                                      target->bs->bl.max_transfer));

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (max_transfer < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * copy_range does not respect max_transfer (it's a TODO), so we factor
         * that in here.
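         *
         * For example (hypothetical values, for illustration only): with a
         * 64 KiB cluster_size and a max_transfer of 1536 KiB, the computation
         * below yields
         *     copy_size = MIN(MAX(64 KiB, 16 MiB),
         *                     QEMU_ALIGN_DOWN(1536 KiB, 64 KiB)) = 1536 KiB.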
         */
        s->use_copy_range = true;
        s->copy_size = MIN(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                           QEMU_ALIGN_DOWN(max_transfer, cluster_size));
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}

void block_copy_set_callbacks(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        ProgressResetCallbackFunc progress_reset_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_reset_callback = progress_reset_callback;
    s->progress_opaque = progress_opaque;
}

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. @end is allowed to exceed s->len only to
 * cover the last cluster when s->len is not aligned to clusters.
 *
 * No synchronization here: neither dirty bitmap updates nor intersecting
 * request handling, only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t start, int64_t end,
                                           bool *error_is_read)
{
    int ret;
    int nbytes = MIN(end, s->len) - start;
    void *bounce_buffer = NULL;

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));
    assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, start, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            goto out;
        }
    }

    /*
     * If the copy_range request above failed, we may proceed with a buffered
     * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
     * be properly limited, so don't worry too much.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

/*
 * Check if the cluster starting at @offset is allocated or not.
 * Return via @pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
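             *
             * For example (hypothetical 64 KiB cluster_size): if the first
             * 16 KiB at @offset are unallocated but the bytes right after
             * them are allocated, the loop runs a second time, ret becomes 1
             * and the partially-unallocated head cluster is reported as
             * allocated together with the following allocated clusters.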
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at @offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        s->progress_reset_callback(s->progress_opaque);
    }

    *count = bytes;
    return ret;
}

int coroutine_fn block_copy(BlockCopyState *s,
                            int64_t start, uint64_t bytes,
                            bool *error_is_read)
{
    int ret = 0;
    int64_t end = bytes + start; /* bytes */
    int64_t status_bytes;
    BlockCopyInFlightReq req;

    /*
     * The block_copy() caller is responsible for keeping source and target
     * in the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));

    block_copy_wait_inflight_reqs(s, start, end);
    block_copy_inflight_req_begin(s, &req, start, end);

    while (start < end) {
        int64_t next_zero, chunk_end;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
            trace_block_copy_skip(s, start);
            start += s->cluster_size;
            continue; /* already copied */
        }

        chunk_end = MIN(end, start + s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
                                                chunk_end - start);
        if (next_zero >= 0) {
            assert(next_zero > start); /* start is dirty */
            assert(next_zero < chunk_end); /* no need to do MIN() */
            chunk_end = next_zero;
        }

        if (s->skip_unallocated) {
            ret = block_copy_reset_unallocated(s, start, &status_bytes);
            if (ret == 0) {
                trace_block_copy_skip_range(s, start, status_bytes);
                start += status_bytes;
                continue;
            }
            /* Clamp to the known allocated region */
            chunk_end = MIN(chunk_end, start + status_bytes);
        }

        trace_block_copy_process(s, start);

        bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);

        co_get_from_shres(s->mem, chunk_end - start);
        ret = block_copy_do_copy(s, start, chunk_end, error_is_read);
        co_put_to_shres(s->mem, chunk_end - start);
        if (ret < 0) {
            bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
            break;
        }

        s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
        start = chunk_end;
        ret = 0;
    }

    block_copy_inflight_req_end(&req);

    return ret;
}
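
/*
 * Illustrative usage sketch (not part of this file; the caller-side names
 * my_progress_bytes, my_progress_reset and my_opaque are hypothetical).
 * A caller such as a backup job, running in coroutine context, would drive
 * the API roughly like this:
 *
 *     BlockCopyState *s = block_copy_state_new(source, target, cluster_size,
 *                                              write_flags, errp);
 *     if (!s) {
 *         return NULL;
 *     }
 *     block_copy_set_callbacks(s, my_progress_bytes, my_progress_reset,
 *                              my_opaque);
 *
 *     for (offset = 0; offset < s->len; offset += s->cluster_size) {
 *         ret = block_copy(s, offset, s->cluster_size, &error_is_read);
 *         if (ret < 0) {
 *             break;
 *         }
 *     }
 *
 *     block_copy_state_free(s);
 */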