xref: /openbmc/qemu/block/backup.c (revision 9c4218e9)
/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "sysemu/block-backend.h"

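/* Copying is done in units of clusters: 1 << 16 = 65536 bytes (64 KiB),
 * i.e. 128 sectors of BDRV_SECTOR_SIZE (512) bytes each. */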
#define BACKUP_CLUSTER_BITS 16
#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)

#define SLICE_TIME 100000000ULL /* ns */

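/* An in-flight copy-on-write request; overlapping requests queue up on
 * wait_queue until the first copy of the range completes. */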
typedef struct CowRequest {
    int64_t start;
    int64_t end;
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;

typedef struct BackupBlockJob {
    BlockJob common;
    BlockDriverState *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
    HBitmap *bitmap;
    QLIST_HEAD(, CowRequest) inflight_reqs;
} BackupBlockJob;

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
{
    req->start = start;
    req->end = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

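/* Copy the clusters covering [sector_num, sector_num + nb_sectors) from the
 * source to the target, skipping clusters that job->bitmap records as
 * already copied.  Called both from the background copy loop and from the
 * before-write notifier; is_write_notifier selects a non-serialising read,
 * which avoids waiting on the very guest write that triggered the notifier.
 * On failure, *error_is_read (if non-NULL) reports whether the read or the
 * write side failed. */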
static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read,
                                      bool is_write_notifier)
{
    BackupBlockJob *job = (BackupBlockJob *)bs->job;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
        if (hbitmap_get(job->bitmap, start)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
                job->common.len / BDRV_SECTOR_SIZE -
                start * BACKUP_SECTORS_PER_CLUSTER);

        if (!bounce_buffer) {
            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        if (is_write_notifier) {
            ret = bdrv_co_readv_no_serialising(bs,
                                           start * BACKUP_SECTORS_PER_CLUSTER,
                                           n, &bounce_qiov);
        } else {
            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
                                &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            goto out;
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
                                       start * BACKUP_SECTORS_PER_CLUSTER,
                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = bdrv_co_writev(job->target,
                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
                                 &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            goto out;
        }

        hbitmap_set(job->bitmap, start, 1);

        /* Publish progress; guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value; it is not a disk offset.
         */
        job->sectors_read += n;
        job->common.offset += n * BDRV_SECTOR_SIZE;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

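/* Before-write notifier: copy out the about-to-be-overwritten range first,
 * so the target keeps the point-in-time contents of the source. */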
static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
}

static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void backup_iostatus_reset(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (s->target->blk) {
        blk_iostatus_reset(s->target->blk);
    }
}

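/* Decide the fate of the frozen sync bitmap when the job finishes: on error
 * or cancellation the successor is merged back so no dirty bits are lost;
 * on success this bitmap is dropped and the successor takes its place. */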
static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
{
    BdrvDirtyBitmap *bm;
    BlockDriverState *bs = job->common.bs;

    if (ret < 0 || block_job_is_cancelled(&job->common)) {
        /* Merge the successor back into the parent; delete nothing. */
        bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
        assert(bm);
    } else {
        /* Everything is fine; delete this bitmap and install the backup. */
        bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
        assert(bm);
    }
}

static void backup_commit(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, 0);
    }
}

static void backup_abort(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, -1);
    }
}

static const BlockJobDriver backup_job_driver = {
    .instance_size  = sizeof(BackupBlockJob),
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
    .commit         = backup_commit,
    .abort          = backup_abort,
};

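/* Map an I/O error to a block-job action according to the on-source-error
 * or on-target-error policy chosen when the job was created. */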
static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->common.bs,
                                      job->on_source_error, true, error);
    } else {
        return block_job_error_action(&job->common, job->target,
                                      job->on_target_error, false, error);
    }
}

typedef struct {
    int ret;
} BackupCompleteData;

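/* Completion callback, deferred to the main loop: release the target and
 * mark the block job as completed. */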
static void backup_complete(BlockJob *job, void *opaque)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

    bdrv_unref(s->target);

    block_job_completed(job, data->ret);
    g_free(data);
}

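/* Sleep to honour the configured rate limit (or just yield if there is
 * none) and check for cancellation.  Returns true if the job has been
 * cancelled. */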
static bool coroutine_fn yield_and_check(BackupBlockJob *job)
{
    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    /* We need to yield so that bdrv_drain_all() returns; without this, the
     * VM does not reboot. */
    if (job->common.speed) {
        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
                                                      job->sectors_read);
        job->sectors_read = 0;
        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
    } else {
        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
    }

    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    return false;
}

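/* sync=incremental: walk the dirty bitmap and copy only the clusters that
 * contain dirty sectors, retrying each cluster until it succeeds or the
 * error policy tells us to give up. */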
static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
{
    bool error_is_read;
    int ret = 0;
    int clusters_per_iter;
    uint32_t granularity;
    int64_t sector;
    int64_t cluster;
    int64_t end;
    int64_t last_cluster = -1;
    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
    clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1);
    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);

    /* Find the next dirty sector(s) */
    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
        cluster = sector / BACKUP_SECTORS_PER_CLUSTER;

        /* Fake progress updates for any clusters we skipped */
        if (cluster != last_cluster + 1) {
            job->common.offset += ((cluster - last_cluster - 1) *
                                   BACKUP_CLUSTER_SIZE);
        }

        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
            do {
                if (yield_and_check(job)) {
                    return ret;
                }
                ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
                                    false);
                if ((ret < 0) &&
                    backup_error_action(job, error_is_read, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    return ret;
                }
            } while (ret < 0);
        }

        /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
        if (granularity < BACKUP_CLUSTER_SIZE) {
            bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER);
        }

        last_cluster = cluster - 1;
    }

    /* Play some final catchup with the progress meter */
    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
    if (last_cluster + 1 < end) {
        job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE);
    }

    return ret;
}

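/* The job's main coroutine.  Installs the before-write notifier and then
 * copies according to the sync mode: nothing for sync=none (copy-on-write
 * only), the dirty bitmap for sync=incremental, or a linear scan of every
 * cluster for sync=full and sync=top. */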
static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
    NotifierWithReturn before_write = {
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);

    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
    if (target->blk) {
        blk_set_on_error(target->blk, on_target_error, on_target_error);
        blk_iostatus_enable(target->blk);
    }

    bdrv_add_before_write_notifier(bs, &before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            job->common.busy = false;
            qemu_coroutine_yield();
            job->common.busy = true;
        }
    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        ret = backup_run_incremental(job);
    } else {
        /* Both FULL and TOP sync modes require copying. */
        for (; start < end; start++) {
            bool error_is_read;
            if (yield_and_check(job)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
                                start * BACKUP_SECTORS_PER_CLUSTER + i,
                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
                    i += n;

                    if (alloced == 1 || n == 0) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* In FULL sync mode we copy the whole drive. */
            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BLOCK_ERROR_ACTION_REPORT) {
                    break;
                } else {
                    start--;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    hbitmap_free(job->bitmap);

    if (target->blk) {
        blk_iostatus_disable(target->blk);
    }
    bdrv_op_unblock_all(target, job->common.blocker);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&job->common, backup_complete, data);
}

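/* backup_start:
 * @bs: Source block device.
 * @target: Device the backup is written to.
 * @speed: Rate limit in bytes per second, or 0 for no limit.
 * @sync_mode: Which parts of @bs to copy (full, top, incremental, none).
 * @sync_bitmap: Dirty bitmap to follow; required exactly when @sync_mode is
 *               incremental.
 * @on_source_error, @on_target_error: Error handling policies.
 * @cb, @opaque: Completion callback and its argument.
 * @txn: Transaction to join, or NULL.
 *
 * Validate the arguments, create the job and kick off its coroutine.  An
 * illustrative call (the real callers live in the QMP/blockdev layer) might
 * look like:
 *
 *   backup_start(bs, target, 0, MIRROR_SYNC_MODE_FULL, NULL,
 *                BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
 *                cb, opaque, NULL, &local_err);
 */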
void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BdrvDirtyBitmap *sync_bitmap,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockCompletionFunc *cb, void *opaque,
                  BlockJobTxn *txn, Error **errp)
{
    int64_t len;

    assert(bs);
    assert(target);
    assert(cb);

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
        return;
    }

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
        return;
    }

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));
        return;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
        return;
    }

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
        return;
    }

    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        if (!sync_bitmap) {
            error_setg(errp, "must provide a valid bitmap name for "
                             "\"incremental\" sync mode");
            return;
        }

        /* Create a new bitmap, and freeze/disable this one. */
        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
            return;
        }
    } else if (sync_bitmap) {
        error_setg(errp,
                   "a sync_bitmap was provided to backup_start, "
                   "but received an incompatible sync_mode (%s)",
                   MirrorSyncMode_lookup[sync_mode]);
        return;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        goto error;
    }

    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
                                           cb, opaque, errp);
    if (!job) {
        goto error;
    }

    bdrv_op_block_all(target, job->common.blocker);

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    block_job_txn_add_job(txn, &job->common);
    qemu_coroutine_enter(job->common.co, job);
    return;

 error:
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
}