xref: /openbmc/qemu/block/block-copy.c (revision ed5abf46)
/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

typedef struct BlockCopyInFlightReq {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyInFlightReq) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} BlockCopyInFlightReq;

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, who is responsible for holding
     * appropriate permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap.  During this process, the
     * bitmap is thus not fully initialized: It may still have bits set for
     * areas that are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is made. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;

static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
                                                           int64_t offset,
                                                           int64_t bytes)
{
    BlockCopyInFlightReq *req;

    QLIST_FOREACH(req, &s->inflight_reqs, list) {
        if (offset + bytes > req->offset && offset < req->offset + req->bytes) {
            return req;
        }
    }

    return NULL;
}
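
/*
 * The check above is the usual half-open interval overlap test:
 * [offset, offset + bytes) intersects [req->offset, req->offset + req->bytes)
 * iff each range starts before the other one ends.  A small illustration
 * (hypothetical numbers, not taken from this file): a request covering
 * [128K, 192K) conflicts with an in-flight request at [64K, 192K), since
 * they share [128K, 192K), but not with one at [192K, 256K), because
 * 128K + 64K > 192K is false.
 */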

/*
 * If there are no intersecting requests, return false. Otherwise, wait for
 * the first intersecting request found to finish, and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, bytes);

    if (!req) {
        return false;
    }

    qemu_co_queue_wait(&req->wait_queue, NULL);

    return true;
}

/* Called only on a full-dirty region */
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t offset, int64_t bytes)
{
    assert(!find_conflicting_inflight_req(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    req->offset = offset;
    req->bytes = bytes;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

/*
 * block_copy_inflight_req_shrink
 *
 * Drop the tail of the request, to be handled later. Set the dirty bits back
 * and wake up all requests waiting for us (some of them may no longer
 * intersect with the shrunk request).
 */
static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
        BlockCopyInFlightReq *req, int64_t new_bytes)
{
    if (new_bytes == req->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < req->bytes);

    s->in_flight_bytes -= req->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          req->offset + new_bytes, req->bytes - new_bytes);

    req->bytes = new_bytes;
    qemu_co_queue_restart_all(&req->wait_queue);
}

static void coroutine_fn block_copy_inflight_req_end(BlockCopyState *s,
                                                     BlockCopyInFlightReq *req,
                                                     int ret)
{
    s->in_flight_bytes -= req->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(s->copy_bitmap, req->offset, req->bytes);
    }
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep a small copy_size until the first
         * successful copy_range (see block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}
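
/*
 * An illustrative sketch (not part of this file) of how a block-copy user
 * might set up and tear down this state.  The BdrvChild pointers, the 64 KiB
 * cluster size and the "progress" meter are hypothetical caller-side values;
 * the caller must already hold appropriate permissions on both children, as
 * noted above:
 *
 *     BlockCopyState *bcs;
 *
 *     bcs = block_copy_state_new(source_child, target_child, 64 * KiB,
 *                                0, errp);
 *     if (!bcs) {
 *         return NULL;
 *     }
 *     block_copy_set_progress_meter(bcs, progress);
 *
 *     ... drive the copy with block_copy() (see below), then:
 *
 *     block_copy_state_free(bcs);
 */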

void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to the
 * cluster size.
 *
 * No synchronization here: neither the bitmap nor intersecting requests are
 * handled, only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fallback to read+write with allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range; now increase copy_size.  copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unset it during the previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
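
                /*
                 * Illustrative arithmetic (hypothetical values, not from
                 * this file): with cluster_size = 64 KiB and a combined
                 * max_transfer of 4 MiB, this yields
                 * MIN(MAX(64 KiB, 16 MiB), ALIGN_DOWN(4 MiB, 64 KiB))
                 *   = MIN(16 MiB, 4 MiB) = 4 MiB,
                 * so subsequent chunks may grow up to 4 MiB each.
                 */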
            }
            goto out;
        }
    }

    /*
     * If the copy_range request above failed, we may proceed with a buffered
     * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
     * be properly limited, so this is not a big concern. Moreover, the most
     * likely case (copy_range is unsupported for the configuration, so the
     * very first copy_range request fails) is handled by growing copy_size
     * only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
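
/*
 * A worked example of the alignment above (hypothetical numbers, not from
 * this file): with cluster_size = 64 KiB, a status chunk of num = 100 KiB in
 * the middle of the image is rounded down to 64 KiB, while a 100 KiB chunk
 * that ends exactly at s->len (an unaligned image tail) is rounded up to
 * 128 KiB so that the final partial cluster is still covered.
 */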

/*
 * Check whether the cluster starting at @offset is allocated.
 * Return via @pnum the number of contiguous clusters sharing this
 * allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret != 0: partial segment(s) are considered allocated.
             * otherwise (count == 0): the unallocated tail is treated as a
             * whole segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}
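
/*
 * An illustrative walk through the loop above (hypothetical numbers, not from
 * this file), with cluster_size = 64 KiB:
 *   - bdrv_is_allocated() reports 32 KiB unallocated (ret == 0), so we loop;
 *   - the next call reports another 96 KiB unallocated, total_count = 128 KiB;
 *   - 128 KiB >= cluster_size, so *pnum = 2 and 0 is returned: the first two
 *     clusters are unallocated, and nothing is claimed about what follows.
 */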

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and a negative error code on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}
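
/*
 * An illustrative sketch (not part of this file) of how a sync=top job might
 * use this helper to finish initializing the bitmap described by
 * skip_unallocated above.  "bcs" and "image_len" are hypothetical caller-side
 * names:
 *
 *     int64_t offset = 0, count;
 *
 *     block_copy_set_skip_unallocated(bcs, true);
 *     while (offset < image_len) {
 *         if (block_copy_reset_unallocated(bcs, offset, &count) < 0) {
 *             break;
 *         }
 *         offset += count;
 *     }
 */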

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * The block_copy() user is responsible for keeping the source and target
     * in the same AioContext.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        BlockCopyInFlightReq req;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        block_copy_inflight_req_begin(s, &req, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fail */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_inflight_req_shrink(s, &req, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_inflight_req_end(s, &req, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_inflight_req_end(s, &req, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}

/*
 * block_copy
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, that helps
 * us. If they fail, we retry the not-yet-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not in some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress was made
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}
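
/*
 * An illustrative sketch (not part of this file) of driving block_copy() from
 * a job coroutine.  "bcs" and "image_len" are hypothetical caller-side names,
 * and image_len is assumed to be a multiple of the cluster size, since
 * block_copy() expects cluster-aligned requests:
 *
 *     int64_t offset;
 *     bool error_is_read;
 *
 *     for (offset = 0; offset < image_len; offset += 1 * MiB) {
 *         int ret = block_copy(bcs, offset,
 *                              MIN(1 * MiB, image_len - offset),
 *                              &error_is_read);
 *         if (ret < 0) {
 *             return ret;
 *         }
 *     }
 */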

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}
579