xref: /openbmc/qemu/block/backup.c (revision 135b03cb)
1 /*
2  * QEMU backup
3  *
4  * Copyright (C) 2013 Proxmox Server Solutions
5  *
6  * Authors:
7  *  Dietmar Maurer (dietmar@proxmox.com)
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 
16 #include "trace.h"
17 #include "block/block.h"
18 #include "block/block_int.h"
19 #include "block/blockjob_int.h"
20 #include "block/block_backup.h"
21 #include "qapi/error.h"
22 #include "qapi/qmp/qerror.h"
23 #include "qemu/ratelimit.h"
24 #include "qemu/cutils.h"
25 #include "sysemu/block-backend.h"
26 #include "qemu/bitmap.h"
27 #include "qemu/error-report.h"
28 
/* Fallback (and minimum) backup cluster size in bytes: 64 KiB. */
#define BACKUP_CLUSTER_SIZE_DEFAULT (64 * 1024)
30 
31 typedef struct CowRequest {
32     int64_t start_byte;
33     int64_t end_byte;
34     QLIST_ENTRY(CowRequest) list;
35     CoQueue wait_queue; /* coroutines blocked on this request */
36 } CowRequest;
37 
38 typedef struct BackupBlockJob {
39     BlockJob common;
40     BlockBackend *target;
41     /* bitmap for sync=incremental */
42     BdrvDirtyBitmap *sync_bitmap;
43     MirrorSyncMode sync_mode;
44     BlockdevOnError on_source_error;
45     BlockdevOnError on_target_error;
46     CoRwlock flush_rwlock;
47     uint64_t len;
48     uint64_t bytes_read;
49     int64_t cluster_size;
50     bool compress;
51     NotifierWithReturn before_write;
52     QLIST_HEAD(, CowRequest) inflight_reqs;
53 
54     HBitmap *copy_bitmap;
55     bool use_copy_range;
56     int64_t copy_range_size;
57 
58     bool serialize_target_writes;
59 } BackupBlockJob;
60 
/* Forward declaration; the initialised definition appears further down, after
 * the callbacks it refers to. */
static const BlockJobDriver backup_job_driver;
62 
63 /* See if in-flight requests overlap and wait for them to complete */
64 static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
65                                                        int64_t start,
66                                                        int64_t end)
67 {
68     CowRequest *req;
69     bool retry;
70 
71     do {
72         retry = false;
73         QLIST_FOREACH(req, &job->inflight_reqs, list) {
74             if (end > req->start_byte && start < req->end_byte) {
75                 qemu_co_queue_wait(&req->wait_queue, NULL);
76                 retry = true;
77                 break;
78             }
79         }
80     } while (retry);
81 }
82 
83 /* Keep track of an in-flight request */
84 static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
85                               int64_t start, int64_t end)
86 {
87     req->start_byte = start;
88     req->end_byte = end;
89     qemu_co_queue_init(&req->wait_queue);
90     QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
91 }
92 
93 /* Forget about a completed request */
94 static void cow_request_end(CowRequest *req)
95 {
96     QLIST_REMOVE(req, list);
97     qemu_co_queue_restart_all(&req->wait_queue);
98 }
99 
100 /* Copy range to target with a bounce buffer and return the bytes copied. If
101  * error occurred, return a negative error number */
102 static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
103                                                       int64_t start,
104                                                       int64_t end,
105                                                       bool is_write_notifier,
106                                                       bool *error_is_read,
107                                                       void **bounce_buffer)
108 {
109     int ret;
110     BlockBackend *blk = job->common.blk;
111     int nbytes;
112     int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
113     int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
114 
115     assert(QEMU_IS_ALIGNED(start, job->cluster_size));
116     hbitmap_reset(job->copy_bitmap, start, job->cluster_size);
117     nbytes = MIN(job->cluster_size, job->len - start);
118     if (!*bounce_buffer) {
119         *bounce_buffer = blk_blockalign(blk, job->cluster_size);
120     }
121 
122     ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
123     if (ret < 0) {
124         trace_backup_do_cow_read_fail(job, start, ret);
125         if (error_is_read) {
126             *error_is_read = true;
127         }
128         goto fail;
129     }
130 
131     if (buffer_is_zero(*bounce_buffer, nbytes)) {
132         ret = blk_co_pwrite_zeroes(job->target, start,
133                                    nbytes, write_flags | BDRV_REQ_MAY_UNMAP);
134     } else {
135         ret = blk_co_pwrite(job->target, start,
136                             nbytes, *bounce_buffer, write_flags |
137                             (job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0));
138     }
139     if (ret < 0) {
140         trace_backup_do_cow_write_fail(job, start, ret);
141         if (error_is_read) {
142             *error_is_read = false;
143         }
144         goto fail;
145     }
146 
147     return nbytes;
148 fail:
149     hbitmap_set(job->copy_bitmap, start, job->cluster_size);
150     return ret;
151 
152 }
153 
154 /* Copy range to target and return the bytes copied. If error occurred, return a
155  * negative error number. */
156 static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
157                                                 int64_t start,
158                                                 int64_t end,
159                                                 bool is_write_notifier)
160 {
161     int ret;
162     int nr_clusters;
163     BlockBackend *blk = job->common.blk;
164     int nbytes;
165     int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
166     int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
167 
168     assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
169     assert(QEMU_IS_ALIGNED(start, job->cluster_size));
170     nbytes = MIN(job->copy_range_size, end - start);
171     nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size);
172     hbitmap_reset(job->copy_bitmap, start, job->cluster_size * nr_clusters);
173     ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
174                             read_flags, write_flags);
175     if (ret < 0) {
176         trace_backup_do_cow_copy_range_fail(job, start, ret);
177         hbitmap_set(job->copy_bitmap, start, job->cluster_size * nr_clusters);
178         return ret;
179     }
180 
181     return nbytes;
182 }
183 
184 static int coroutine_fn backup_do_cow(BackupBlockJob *job,
185                                       int64_t offset, uint64_t bytes,
186                                       bool *error_is_read,
187                                       bool is_write_notifier)
188 {
189     CowRequest cow_request;
190     int ret = 0;
191     int64_t start, end; /* bytes */
192     void *bounce_buffer = NULL;
193 
194     qemu_co_rwlock_rdlock(&job->flush_rwlock);
195 
196     start = QEMU_ALIGN_DOWN(offset, job->cluster_size);
197     end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size);
198 
199     trace_backup_do_cow_enter(job, start, offset, bytes);
200 
201     wait_for_overlapping_requests(job, start, end);
202     cow_request_begin(&cow_request, job, start, end);
203 
204     while (start < end) {
205         int64_t dirty_end;
206 
207         if (!hbitmap_get(job->copy_bitmap, start)) {
208             trace_backup_do_cow_skip(job, start);
209             start += job->cluster_size;
210             continue; /* already copied */
211         }
212 
213         dirty_end = hbitmap_next_zero(job->copy_bitmap, start, (end - start));
214         if (dirty_end < 0) {
215             dirty_end = end;
216         }
217 
218         trace_backup_do_cow_process(job, start);
219 
220         if (job->use_copy_range) {
221             ret = backup_cow_with_offload(job, start, dirty_end,
222                                           is_write_notifier);
223             if (ret < 0) {
224                 job->use_copy_range = false;
225             }
226         }
227         if (!job->use_copy_range) {
228             ret = backup_cow_with_bounce_buffer(job, start, dirty_end,
229                                                 is_write_notifier,
230                                                 error_is_read, &bounce_buffer);
231         }
232         if (ret < 0) {
233             break;
234         }
235 
236         /* Publish progress, guest I/O counts as progress too.  Note that the
237          * offset field is an opaque progress value, it is not a disk offset.
238          */
239         start += ret;
240         job->bytes_read += ret;
241         job_progress_update(&job->common.job, ret);
242         ret = 0;
243     }
244 
245     if (bounce_buffer) {
246         qemu_vfree(bounce_buffer);
247     }
248 
249     cow_request_end(&cow_request);
250 
251     trace_backup_do_cow_return(job, offset, bytes, ret);
252 
253     qemu_co_rwlock_unlock(&job->flush_rwlock);
254 
255     return ret;
256 }
257 
258 static int coroutine_fn backup_before_write_notify(
259         NotifierWithReturn *notifier,
260         void *opaque)
261 {
262     BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
263     BdrvTrackedRequest *req = opaque;
264 
265     assert(req->bs == blk_bs(job->common.blk));
266     assert(QEMU_IS_ALIGNED(req->offset, BDRV_SECTOR_SIZE));
267     assert(QEMU_IS_ALIGNED(req->bytes, BDRV_SECTOR_SIZE));
268 
269     return backup_do_cow(job, req->offset, req->bytes, NULL, true);
270 }
271 
272 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
273 {
274     BdrvDirtyBitmap *bm;
275     BlockDriverState *bs = blk_bs(job->common.blk);
276 
277     if (ret < 0) {
278         /* Merge the successor back into the parent, delete nothing. */
279         bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
280         assert(bm);
281     } else {
282         /* Everything is fine, delete this bitmap and install the backup. */
283         bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
284         assert(bm);
285     }
286 }
287 
288 static void backup_commit(Job *job)
289 {
290     BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
291     if (s->sync_bitmap) {
292         backup_cleanup_sync_bitmap(s, 0);
293     }
294 }
295 
296 static void backup_abort(Job *job)
297 {
298     BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
299     if (s->sync_bitmap) {
300         backup_cleanup_sync_bitmap(s, -1);
301     }
302 }
303 
304 static void backup_clean(Job *job)
305 {
306     BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
307     assert(s->target);
308     blk_unref(s->target);
309     s->target = NULL;
310 
311     if (s->copy_bitmap) {
312         hbitmap_free(s->copy_bitmap);
313         s->copy_bitmap = NULL;
314     }
315 }
316 
317 void backup_do_checkpoint(BlockJob *job, Error **errp)
318 {
319     BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
320 
321     assert(block_job_driver(job) == &backup_job_driver);
322 
323     if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
324         error_setg(errp, "The backup job only supports block checkpoint in"
325                    " sync=none mode");
326         return;
327     }
328 
329     hbitmap_set(backup_job->copy_bitmap, 0, backup_job->len);
330 }
331 
332 static void backup_drain(BlockJob *job)
333 {
334     BackupBlockJob *s = container_of(job, BackupBlockJob, common);
335 
336     /* Need to keep a reference in case blk_drain triggers execution
337      * of backup_complete...
338      */
339     if (s->target) {
340         BlockBackend *target = s->target;
341         blk_ref(target);
342         blk_drain(target);
343         blk_unref(target);
344     }
345 }
346 
347 static BlockErrorAction backup_error_action(BackupBlockJob *job,
348                                             bool read, int error)
349 {
350     if (read) {
351         return block_job_error_action(&job->common, job->on_source_error,
352                                       true, error);
353     } else {
354         return block_job_error_action(&job->common, job->on_target_error,
355                                       false, error);
356     }
357 }
358 
359 static bool coroutine_fn yield_and_check(BackupBlockJob *job)
360 {
361     uint64_t delay_ns;
362 
363     if (job_is_cancelled(&job->common.job)) {
364         return true;
365     }
366 
367     /* We need to yield even for delay_ns = 0 so that bdrv_drain_all() can
368      * return. Without a yield, the VM would not reboot. */
369     delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read);
370     job->bytes_read = 0;
371     job_sleep_ns(&job->common.job, delay_ns);
372 
373     if (job_is_cancelled(&job->common.job)) {
374         return true;
375     }
376 
377     return false;
378 }
379 
380 static bool bdrv_is_unallocated_range(BlockDriverState *bs,
381                                       int64_t offset, int64_t bytes)
382 {
383     int64_t end = offset + bytes;
384 
385     while (offset < end && !bdrv_is_allocated(bs, offset, bytes, &bytes)) {
386         if (bytes == 0) {
387             return true;
388         }
389         offset += bytes;
390         bytes = end - offset;
391     }
392 
393     return offset >= end;
394 }
395 
396 static int coroutine_fn backup_loop(BackupBlockJob *job)
397 {
398     int ret;
399     bool error_is_read;
400     int64_t offset;
401     HBitmapIter hbi;
402     BlockDriverState *bs = blk_bs(job->common.blk);
403 
404     hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
405     while ((offset = hbitmap_iter_next(&hbi)) != -1) {
406         if (job->sync_mode == MIRROR_SYNC_MODE_TOP &&
407             bdrv_is_unallocated_range(bs, offset, job->cluster_size))
408         {
409             hbitmap_reset(job->copy_bitmap, offset, job->cluster_size);
410             continue;
411         }
412 
413         do {
414             if (yield_and_check(job)) {
415                 return 0;
416             }
417             ret = backup_do_cow(job, offset,
418                                 job->cluster_size, &error_is_read, false);
419             if (ret < 0 && backup_error_action(job, error_is_read, -ret) ==
420                            BLOCK_ERROR_ACTION_REPORT)
421             {
422                 return ret;
423             }
424         } while (ret < 0);
425     }
426 
427     return 0;
428 }
429 
430 /* init copy_bitmap from sync_bitmap */
431 static void backup_incremental_init_copy_bitmap(BackupBlockJob *job)
432 {
433     uint64_t offset = 0;
434     uint64_t bytes = job->len;
435 
436     while (bdrv_dirty_bitmap_next_dirty_area(job->sync_bitmap,
437                                              &offset, &bytes))
438     {
439         hbitmap_set(job->copy_bitmap, offset, bytes);
440 
441         offset += bytes;
442         if (offset >= job->len) {
443             break;
444         }
445         bytes = job->len - offset;
446     }
447 
448     /* TODO job_progress_set_remaining() would make more sense */
449     job_progress_update(&job->common.job,
450         job->len - hbitmap_count(job->copy_bitmap));
451 }
452 
453 static int coroutine_fn backup_run(Job *job, Error **errp)
454 {
455     BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
456     BlockDriverState *bs = blk_bs(s->common.blk);
457     int ret = 0;
458 
459     QLIST_INIT(&s->inflight_reqs);
460     qemu_co_rwlock_init(&s->flush_rwlock);
461 
462     job_progress_set_remaining(job, s->len);
463 
464     if (s->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
465         backup_incremental_init_copy_bitmap(s);
466     } else {
467         hbitmap_set(s->copy_bitmap, 0, s->len);
468     }
469 
470     s->before_write.notify = backup_before_write_notify;
471     bdrv_add_before_write_notifier(bs, &s->before_write);
472 
473     if (s->sync_mode == MIRROR_SYNC_MODE_NONE) {
474         /* All bits are set in copy_bitmap to allow any cluster to be copied.
475          * This does not actually require them to be copied. */
476         while (!job_is_cancelled(job)) {
477             /* Yield until the job is cancelled.  We just let our before_write
478              * notify callback service CoW requests. */
479             job_yield(job);
480         }
481     } else {
482         ret = backup_loop(s);
483     }
484 
485     notifier_with_return_remove(&s->before_write);
486 
487     /* wait until pending backup_do_cow() calls have completed */
488     qemu_co_rwlock_wrlock(&s->flush_rwlock);
489     qemu_co_rwlock_unlock(&s->flush_rwlock);
490 
491     return ret;
492 }
493 
494 static const BlockJobDriver backup_job_driver = {
495     .job_driver = {
496         .instance_size          = sizeof(BackupBlockJob),
497         .job_type               = JOB_TYPE_BACKUP,
498         .free                   = block_job_free,
499         .user_resume            = block_job_user_resume,
500         .drain                  = block_job_drain,
501         .run                    = backup_run,
502         .commit                 = backup_commit,
503         .abort                  = backup_abort,
504         .clean                  = backup_clean,
505     },
506     .drain                  = backup_drain,
507 };
508 
509 static int64_t backup_calculate_cluster_size(BlockDriverState *target,
510                                              Error **errp)
511 {
512     int ret;
513     BlockDriverInfo bdi;
514 
515     /*
516      * If there is no backing file on the target, we cannot rely on COW if our
517      * backup cluster size is smaller than the target cluster size. Even for
518      * targets with a backing file, try to avoid COW if possible.
519      */
520     ret = bdrv_get_info(target, &bdi);
521     if (ret == -ENOTSUP && !target->backing) {
522         /* Cluster size is not defined */
523         warn_report("The target block device doesn't provide "
524                     "information about the block size and it doesn't have a "
525                     "backing file. The default block size of %u bytes is "
526                     "used. If the actual block size of the target exceeds "
527                     "this default, the backup may be unusable",
528                     BACKUP_CLUSTER_SIZE_DEFAULT);
529         return BACKUP_CLUSTER_SIZE_DEFAULT;
530     } else if (ret < 0 && !target->backing) {
531         error_setg_errno(errp, -ret,
532             "Couldn't determine the cluster size of the target image, "
533             "which has no backing file");
534         error_append_hint(errp,
535             "Aborting, since this may create an unusable destination image\n");
536         return ret;
537     } else if (ret < 0 && target->backing) {
538         /* Not fatal; just trudge on ahead. */
539         return BACKUP_CLUSTER_SIZE_DEFAULT;
540     }
541 
542     return MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
543 }
544 
545 BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
546                   BlockDriverState *target, int64_t speed,
547                   MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
548                   bool compress,
549                   BlockdevOnError on_source_error,
550                   BlockdevOnError on_target_error,
551                   int creation_flags,
552                   BlockCompletionFunc *cb, void *opaque,
553                   JobTxn *txn, Error **errp)
554 {
555     int64_t len;
556     BackupBlockJob *job = NULL;
557     int ret;
558     int64_t cluster_size;
559     HBitmap *copy_bitmap = NULL;
560 
561     assert(bs);
562     assert(target);
563 
564     if (bs == target) {
565         error_setg(errp, "Source and target cannot be the same");
566         return NULL;
567     }
568 
569     if (!bdrv_is_inserted(bs)) {
570         error_setg(errp, "Device is not inserted: %s",
571                    bdrv_get_device_name(bs));
572         return NULL;
573     }
574 
575     if (!bdrv_is_inserted(target)) {
576         error_setg(errp, "Device is not inserted: %s",
577                    bdrv_get_device_name(target));
578         return NULL;
579     }
580 
581     if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
582         error_setg(errp, "Compression is not supported for this drive %s",
583                    bdrv_get_device_name(target));
584         return NULL;
585     }
586 
587     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
588         return NULL;
589     }
590 
591     if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
592         return NULL;
593     }
594 
595     if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
596         if (!sync_bitmap) {
597             error_setg(errp, "must provide a valid bitmap name for "
598                              "\"incremental\" sync mode");
599             return NULL;
600         }
601 
602         /* Create a new bitmap, and freeze/disable this one. */
603         if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
604             return NULL;
605         }
606     } else if (sync_bitmap) {
607         error_setg(errp,
608                    "a sync_bitmap was provided to backup_run, "
609                    "but received an incompatible sync_mode (%s)",
610                    MirrorSyncMode_str(sync_mode));
611         return NULL;
612     }
613 
614     len = bdrv_getlength(bs);
615     if (len < 0) {
616         error_setg_errno(errp, -len, "unable to get length for '%s'",
617                          bdrv_get_device_name(bs));
618         goto error;
619     }
620 
621     cluster_size = backup_calculate_cluster_size(target, errp);
622     if (cluster_size < 0) {
623         goto error;
624     }
625 
626     copy_bitmap = hbitmap_alloc(len, ctz32(cluster_size));
627 
628     /* job->len is fixed, so we can't allow resize */
629     job = block_job_create(job_id, &backup_job_driver, txn, bs,
630                            BLK_PERM_CONSISTENT_READ,
631                            BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
632                            BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD,
633                            speed, creation_flags, cb, opaque, errp);
634     if (!job) {
635         goto error;
636     }
637 
638     /* The target must match the source in size, so no resize here either */
639     job->target = blk_new(job->common.job.aio_context,
640                           BLK_PERM_WRITE,
641                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
642                           BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD);
643     ret = blk_insert_bs(job->target, target, errp);
644     if (ret < 0) {
645         goto error;
646     }
647     blk_set_disable_request_queuing(job->target, true);
648 
649     job->on_source_error = on_source_error;
650     job->on_target_error = on_target_error;
651     job->sync_mode = sync_mode;
652     job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
653                        sync_bitmap : NULL;
654     job->compress = compress;
655 
656     /* Detect image-fleecing (and similar) schemes */
657     job->serialize_target_writes = bdrv_chain_contains(target, bs);
658     job->cluster_size = cluster_size;
659     job->copy_bitmap = copy_bitmap;
660     copy_bitmap = NULL;
661     job->use_copy_range = !compress; /* compression isn't supported for it */
662     job->copy_range_size = MIN_NON_ZERO(blk_get_max_transfer(job->common.blk),
663                                         blk_get_max_transfer(job->target));
664     job->copy_range_size = MAX(job->cluster_size,
665                                QEMU_ALIGN_UP(job->copy_range_size,
666                                              job->cluster_size));
667 
668     /* Required permissions are already taken with target's blk_new() */
669     block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
670                        &error_abort);
671     job->len = len;
672 
673     return &job->common;
674 
675  error:
676     if (copy_bitmap) {
677         assert(!job || !job->copy_bitmap);
678         hbitmap_free(copy_bitmap);
679     }
680     if (sync_bitmap) {
681         bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
682     }
683     if (job) {
684         backup_clean(&job->common.job);
685         job_early_fail(&job->common.job);
686     }
687 
688     return NULL;
689 }
690