xref: /openbmc/qemu/block/backup.c (revision 500eb6db)
1 /*
2  * QEMU backup
3  *
4  * Copyright (C) 2013 Proxmox Server Solutions
5  *
6  * Authors:
7  *  Dietmar Maurer (dietmar@proxmox.com)
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  *
12  */
13 
14 #include "qemu/osdep.h"
15 
16 #include "trace.h"
17 #include "block/block.h"
18 #include "block/block_int.h"
19 #include "block/blockjob_int.h"
20 #include "block/block_backup.h"
21 #include "qapi/error.h"
22 #include "qapi/qmp/qerror.h"
23 #include "qemu/ratelimit.h"
24 #include "qemu/cutils.h"
25 #include "sysemu/block-backend.h"
26 #include "qemu/bitmap.h"
27 #include "qemu/error-report.h"
28 
/*
 * Fallback and lower bound (in bytes) for the backup copy granularity; see
 * backup_calculate_cluster_size().
 */
#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
30 
/*
 * Bookkeeping for one copy operation that is currently in progress.  The
 * request covers the half-open byte range [start_byte, end_byte) and is
 * linked into BackupBlockJob.inflight_reqs while it runs.
 */
typedef struct CowRequest {
    int64_t start_byte;
    int64_t end_byte;
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;
37 
/* State of one backup block job. */
typedef struct BackupBlockJob {
    BlockJob common;
    /* BlockBackend the copied data is written to */
    BlockBackend *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
    /* Policies handed to block_job_error_action() for read/write failures */
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    /*
     * Held as reader by every backup_do_cow() call; backup_run() takes it as
     * writer to wait until all of them have finished.
     */
    CoRwlock flush_rwlock;
    /* Length of the source in bytes, sampled once when the job is created */
    uint64_t len;
    /* Bytes copied since the last rate-limit computation in yield_and_check() */
    uint64_t bytes_read;
    /* Copy granularity in bytes; also the granularity of copy_bitmap */
    int64_t cluster_size;
    /* Ask for compressed writes on the bounce-buffer path */
    bool compress;
    /* Notifier registered on the source node by backup_run() */
    NotifierWithReturn before_write;
    /* Copy operations currently in flight, see CowRequest */
    QLIST_HEAD(, CowRequest) inflight_reqs;

    /* A set bit means the cluster still may have to be copied */
    HBitmap *copy_bitmap;
    /* Try blk_co_copy_range(); cleared after the first failure */
    bool use_copy_range;
    /* Upper bound in bytes for a single blk_co_copy_range() call */
    int64_t copy_range_size;

    /* Set when the target's node chain contains the source node */
    bool serialize_target_writes;
} BackupBlockJob;
60 
/* Defined further down; backup_do_checkpoint() needs the address earlier. */
static const BlockJobDriver backup_job_driver;
62 
63 /* See if in-flight requests overlap and wait for them to complete */
64 static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
65                                                        int64_t start,
66                                                        int64_t end)
67 {
68     CowRequest *req;
69     bool retry;
70 
71     do {
72         retry = false;
73         QLIST_FOREACH(req, &job->inflight_reqs, list) {
74             if (end > req->start_byte && start < req->end_byte) {
75                 qemu_co_queue_wait(&req->wait_queue, NULL);
76                 retry = true;
77                 break;
78             }
79         }
80     } while (retry);
81 }
82 
83 /* Keep track of an in-flight request */
84 static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
85                               int64_t start, int64_t end)
86 {
87     req->start_byte = start;
88     req->end_byte = end;
89     qemu_co_queue_init(&req->wait_queue);
90     QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
91 }
92 
/*
 * Forget about a completed request: unlink it from the in-flight list, then
 * restart every coroutine queued on its wait_queue.
 */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
99 
100 /* Copy range to target with a bounce buffer and return the bytes copied. If
101  * error occurred, return a negative error number */
102 static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
103                                                       int64_t start,
104                                                       int64_t end,
105                                                       bool is_write_notifier,
106                                                       bool *error_is_read,
107                                                       void **bounce_buffer)
108 {
109     int ret;
110     BlockBackend *blk = job->common.blk;
111     int nbytes;
112     int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
113     int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
114 
115     assert(QEMU_IS_ALIGNED(start, job->cluster_size));
116     hbitmap_reset(job->copy_bitmap, start, job->cluster_size);
117     nbytes = MIN(job->cluster_size, job->len - start);
118     if (!*bounce_buffer) {
119         *bounce_buffer = blk_blockalign(blk, job->cluster_size);
120     }
121 
122     ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
123     if (ret < 0) {
124         trace_backup_do_cow_read_fail(job, start, ret);
125         if (error_is_read) {
126             *error_is_read = true;
127         }
128         goto fail;
129     }
130 
131     if (buffer_is_zero(*bounce_buffer, nbytes)) {
132         ret = blk_co_pwrite_zeroes(job->target, start,
133                                    nbytes, write_flags | BDRV_REQ_MAY_UNMAP);
134     } else {
135         ret = blk_co_pwrite(job->target, start,
136                             nbytes, *bounce_buffer, write_flags |
137                             (job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0));
138     }
139     if (ret < 0) {
140         trace_backup_do_cow_write_fail(job, start, ret);
141         if (error_is_read) {
142             *error_is_read = false;
143         }
144         goto fail;
145     }
146 
147     return nbytes;
148 fail:
149     hbitmap_set(job->copy_bitmap, start, job->cluster_size);
150     return ret;
151 
152 }
153 
154 /* Copy range to target and return the bytes copied. If error occurred, return a
155  * negative error number. */
156 static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
157                                                 int64_t start,
158                                                 int64_t end,
159                                                 bool is_write_notifier)
160 {
161     int ret;
162     int nr_clusters;
163     BlockBackend *blk = job->common.blk;
164     int nbytes;
165     int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
166     int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
167 
168     assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
169     assert(QEMU_IS_ALIGNED(start, job->cluster_size));
170     nbytes = MIN(job->copy_range_size, end - start);
171     nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size);
172     hbitmap_reset(job->copy_bitmap, start, job->cluster_size * nr_clusters);
173     ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
174                             read_flags, write_flags);
175     if (ret < 0) {
176         trace_backup_do_cow_copy_range_fail(job, start, ret);
177         hbitmap_set(job->copy_bitmap, start, job->cluster_size * nr_clusters);
178         return ret;
179     }
180 
181     return nbytes;
182 }
183 
184 static int coroutine_fn backup_do_cow(BackupBlockJob *job,
185                                       int64_t offset, uint64_t bytes,
186                                       bool *error_is_read,
187                                       bool is_write_notifier)
188 {
189     CowRequest cow_request;
190     int ret = 0;
191     int64_t start, end; /* bytes */
192     void *bounce_buffer = NULL;
193 
194     qemu_co_rwlock_rdlock(&job->flush_rwlock);
195 
196     start = QEMU_ALIGN_DOWN(offset, job->cluster_size);
197     end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size);
198 
199     trace_backup_do_cow_enter(job, start, offset, bytes);
200 
201     wait_for_overlapping_requests(job, start, end);
202     cow_request_begin(&cow_request, job, start, end);
203 
204     while (start < end) {
205         if (!hbitmap_get(job->copy_bitmap, start)) {
206             trace_backup_do_cow_skip(job, start);
207             start += job->cluster_size;
208             continue; /* already copied */
209         }
210 
211         trace_backup_do_cow_process(job, start);
212 
213         if (job->use_copy_range) {
214             ret = backup_cow_with_offload(job, start, end, is_write_notifier);
215             if (ret < 0) {
216                 job->use_copy_range = false;
217             }
218         }
219         if (!job->use_copy_range) {
220             ret = backup_cow_with_bounce_buffer(job, start, end, is_write_notifier,
221                                                 error_is_read, &bounce_buffer);
222         }
223         if (ret < 0) {
224             break;
225         }
226 
227         /* Publish progress, guest I/O counts as progress too.  Note that the
228          * offset field is an opaque progress value, it is not a disk offset.
229          */
230         start += ret;
231         job->bytes_read += ret;
232         job_progress_update(&job->common.job, ret);
233         ret = 0;
234     }
235 
236     if (bounce_buffer) {
237         qemu_vfree(bounce_buffer);
238     }
239 
240     cow_request_end(&cow_request);
241 
242     trace_backup_do_cow_return(job, offset, bytes, ret);
243 
244     qemu_co_rwlock_unlock(&job->flush_rwlock);
245 
246     return ret;
247 }
248 
249 static int coroutine_fn backup_before_write_notify(
250         NotifierWithReturn *notifier,
251         void *opaque)
252 {
253     BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
254     BdrvTrackedRequest *req = opaque;
255 
256     assert(req->bs == blk_bs(job->common.blk));
257     assert(QEMU_IS_ALIGNED(req->offset, BDRV_SECTOR_SIZE));
258     assert(QEMU_IS_ALIGNED(req->bytes, BDRV_SECTOR_SIZE));
259 
260     return backup_do_cow(job, req->offset, req->bytes, NULL, true);
261 }
262 
263 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
264 {
265     BdrvDirtyBitmap *bm;
266     BlockDriverState *bs = blk_bs(job->common.blk);
267 
268     if (ret < 0) {
269         /* Merge the successor back into the parent, delete nothing. */
270         bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
271         assert(bm);
272     } else {
273         /* Everything is fine, delete this bitmap and install the backup. */
274         bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
275         assert(bm);
276     }
277 }
278 
279 static void backup_commit(Job *job)
280 {
281     BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
282     if (s->sync_bitmap) {
283         backup_cleanup_sync_bitmap(s, 0);
284     }
285 }
286 
287 static void backup_abort(Job *job)
288 {
289     BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
290     if (s->sync_bitmap) {
291         backup_cleanup_sync_bitmap(s, -1);
292     }
293 }
294 
295 static void backup_clean(Job *job)
296 {
297     BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
298     assert(s->target);
299     blk_unref(s->target);
300     s->target = NULL;
301 
302     if (s->copy_bitmap) {
303         hbitmap_free(s->copy_bitmap);
304         s->copy_bitmap = NULL;
305     }
306 }
307 
308 void backup_do_checkpoint(BlockJob *job, Error **errp)
309 {
310     BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
311 
312     assert(block_job_driver(job) == &backup_job_driver);
313 
314     if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
315         error_setg(errp, "The backup job only supports block checkpoint in"
316                    " sync=none mode");
317         return;
318     }
319 
320     hbitmap_set(backup_job->copy_bitmap, 0, backup_job->len);
321 }
322 
323 static void backup_drain(BlockJob *job)
324 {
325     BackupBlockJob *s = container_of(job, BackupBlockJob, common);
326 
327     /* Need to keep a reference in case blk_drain triggers execution
328      * of backup_complete...
329      */
330     if (s->target) {
331         BlockBackend *target = s->target;
332         blk_ref(target);
333         blk_drain(target);
334         blk_unref(target);
335     }
336 }
337 
338 static BlockErrorAction backup_error_action(BackupBlockJob *job,
339                                             bool read, int error)
340 {
341     if (read) {
342         return block_job_error_action(&job->common, job->on_source_error,
343                                       true, error);
344     } else {
345         return block_job_error_action(&job->common, job->on_target_error,
346                                       false, error);
347     }
348 }
349 
350 static bool coroutine_fn yield_and_check(BackupBlockJob *job)
351 {
352     uint64_t delay_ns;
353 
354     if (job_is_cancelled(&job->common.job)) {
355         return true;
356     }
357 
358     /* We need to yield even for delay_ns = 0 so that bdrv_drain_all() can
359      * return. Without a yield, the VM would not reboot. */
360     delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read);
361     job->bytes_read = 0;
362     job_sleep_ns(&job->common.job, delay_ns);
363 
364     if (job_is_cancelled(&job->common.job)) {
365         return true;
366     }
367 
368     return false;
369 }
370 
371 static bool bdrv_is_unallocated_range(BlockDriverState *bs,
372                                       int64_t offset, int64_t bytes)
373 {
374     int64_t end = offset + bytes;
375 
376     while (offset < end && !bdrv_is_allocated(bs, offset, bytes, &bytes)) {
377         if (bytes == 0) {
378             return true;
379         }
380         offset += bytes;
381         bytes = end - offset;
382     }
383 
384     return offset >= end;
385 }
386 
387 static int coroutine_fn backup_loop(BackupBlockJob *job)
388 {
389     int ret;
390     bool error_is_read;
391     int64_t offset;
392     HBitmapIter hbi;
393     BlockDriverState *bs = blk_bs(job->common.blk);
394 
395     hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
396     while ((offset = hbitmap_iter_next(&hbi)) != -1) {
397         if (job->sync_mode == MIRROR_SYNC_MODE_TOP &&
398             bdrv_is_unallocated_range(bs, offset, job->cluster_size))
399         {
400             hbitmap_reset(job->copy_bitmap, offset, job->cluster_size);
401             continue;
402         }
403 
404         do {
405             if (yield_and_check(job)) {
406                 return 0;
407             }
408             ret = backup_do_cow(job, offset,
409                                 job->cluster_size, &error_is_read, false);
410             if (ret < 0 && backup_error_action(job, error_is_read, -ret) ==
411                            BLOCK_ERROR_ACTION_REPORT)
412             {
413                 return ret;
414             }
415         } while (ret < 0);
416     }
417 
418     return 0;
419 }
420 
421 /* init copy_bitmap from sync_bitmap */
422 static void backup_incremental_init_copy_bitmap(BackupBlockJob *job)
423 {
424     uint64_t offset = 0;
425     uint64_t bytes = job->len;
426 
427     while (bdrv_dirty_bitmap_next_dirty_area(job->sync_bitmap,
428                                              &offset, &bytes))
429     {
430         hbitmap_set(job->copy_bitmap, offset, bytes);
431 
432         offset += bytes;
433         if (offset >= job->len) {
434             break;
435         }
436         bytes = job->len - offset;
437     }
438 
439     /* TODO job_progress_set_remaining() would make more sense */
440     job_progress_update(&job->common.job,
441         job->len - hbitmap_count(job->copy_bitmap));
442 }
443 
444 static int coroutine_fn backup_run(Job *job, Error **errp)
445 {
446     BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
447     BlockDriverState *bs = blk_bs(s->common.blk);
448     int ret = 0;
449 
450     QLIST_INIT(&s->inflight_reqs);
451     qemu_co_rwlock_init(&s->flush_rwlock);
452 
453     job_progress_set_remaining(job, s->len);
454 
455     if (s->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
456         backup_incremental_init_copy_bitmap(s);
457     } else {
458         hbitmap_set(s->copy_bitmap, 0, s->len);
459     }
460 
461     s->before_write.notify = backup_before_write_notify;
462     bdrv_add_before_write_notifier(bs, &s->before_write);
463 
464     if (s->sync_mode == MIRROR_SYNC_MODE_NONE) {
465         /* All bits are set in copy_bitmap to allow any cluster to be copied.
466          * This does not actually require them to be copied. */
467         while (!job_is_cancelled(job)) {
468             /* Yield until the job is cancelled.  We just let our before_write
469              * notify callback service CoW requests. */
470             job_yield(job);
471         }
472     } else {
473         ret = backup_loop(s);
474     }
475 
476     notifier_with_return_remove(&s->before_write);
477 
478     /* wait until pending backup_do_cow() calls have completed */
479     qemu_co_rwlock_wrlock(&s->flush_rwlock);
480     qemu_co_rwlock_unlock(&s->flush_rwlock);
481 
482     return ret;
483 }
484 
/*
 * Driver table of the backup job.  The generic block job helpers are used for
 * .free, .user_resume and the Job-level .drain; everything else is
 * implemented in this file.
 */
static const BlockJobDriver backup_job_driver = {
    .job_driver = {
        .instance_size          = sizeof(BackupBlockJob),
        .job_type               = JOB_TYPE_BACKUP,
        .free                   = block_job_free,
        .user_resume            = block_job_user_resume,
        .drain                  = block_job_drain,
        .run                    = backup_run,
        .commit                 = backup_commit,
        .abort                  = backup_abort,
        .clean                  = backup_clean,
    },
    /* BlockJob-level drain: drains the target BlockBackend */
    .drain                  = backup_drain,
};
499 
500 static int64_t backup_calculate_cluster_size(BlockDriverState *target,
501                                              Error **errp)
502 {
503     int ret;
504     BlockDriverInfo bdi;
505 
506     /*
507      * If there is no backing file on the target, we cannot rely on COW if our
508      * backup cluster size is smaller than the target cluster size. Even for
509      * targets with a backing file, try to avoid COW if possible.
510      */
511     ret = bdrv_get_info(target, &bdi);
512     if (ret == -ENOTSUP && !target->backing) {
513         /* Cluster size is not defined */
514         warn_report("The target block device doesn't provide "
515                     "information about the block size and it doesn't have a "
516                     "backing file. The default block size of %u bytes is "
517                     "used. If the actual block size of the target exceeds "
518                     "this default, the backup may be unusable",
519                     BACKUP_CLUSTER_SIZE_DEFAULT);
520         return BACKUP_CLUSTER_SIZE_DEFAULT;
521     } else if (ret < 0 && !target->backing) {
522         error_setg_errno(errp, -ret,
523             "Couldn't determine the cluster size of the target image, "
524             "which has no backing file");
525         error_append_hint(errp,
526             "Aborting, since this may create an unusable destination image\n");
527         return ret;
528     } else if (ret < 0 && target->backing) {
529         /* Not fatal; just trudge on ahead. */
530         return BACKUP_CLUSTER_SIZE_DEFAULT;
531     }
532 
533     return MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
534 }
535 
536 BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
537                   BlockDriverState *target, int64_t speed,
538                   MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
539                   bool compress,
540                   BlockdevOnError on_source_error,
541                   BlockdevOnError on_target_error,
542                   int creation_flags,
543                   BlockCompletionFunc *cb, void *opaque,
544                   JobTxn *txn, Error **errp)
545 {
546     int64_t len;
547     BackupBlockJob *job = NULL;
548     int ret;
549     int64_t cluster_size;
550     HBitmap *copy_bitmap = NULL;
551 
552     assert(bs);
553     assert(target);
554 
555     if (bs == target) {
556         error_setg(errp, "Source and target cannot be the same");
557         return NULL;
558     }
559 
560     if (!bdrv_is_inserted(bs)) {
561         error_setg(errp, "Device is not inserted: %s",
562                    bdrv_get_device_name(bs));
563         return NULL;
564     }
565 
566     if (!bdrv_is_inserted(target)) {
567         error_setg(errp, "Device is not inserted: %s",
568                    bdrv_get_device_name(target));
569         return NULL;
570     }
571 
572     if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
573         error_setg(errp, "Compression is not supported for this drive %s",
574                    bdrv_get_device_name(target));
575         return NULL;
576     }
577 
578     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
579         return NULL;
580     }
581 
582     if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
583         return NULL;
584     }
585 
586     if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
587         if (!sync_bitmap) {
588             error_setg(errp, "must provide a valid bitmap name for "
589                              "\"incremental\" sync mode");
590             return NULL;
591         }
592 
593         /* Create a new bitmap, and freeze/disable this one. */
594         if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
595             return NULL;
596         }
597     } else if (sync_bitmap) {
598         error_setg(errp,
599                    "a sync_bitmap was provided to backup_run, "
600                    "but received an incompatible sync_mode (%s)",
601                    MirrorSyncMode_str(sync_mode));
602         return NULL;
603     }
604 
605     len = bdrv_getlength(bs);
606     if (len < 0) {
607         error_setg_errno(errp, -len, "unable to get length for '%s'",
608                          bdrv_get_device_name(bs));
609         goto error;
610     }
611 
612     cluster_size = backup_calculate_cluster_size(target, errp);
613     if (cluster_size < 0) {
614         goto error;
615     }
616 
617     copy_bitmap = hbitmap_alloc(len, ctz32(cluster_size));
618 
619     /* job->len is fixed, so we can't allow resize */
620     job = block_job_create(job_id, &backup_job_driver, txn, bs,
621                            BLK_PERM_CONSISTENT_READ,
622                            BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
623                            BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD,
624                            speed, creation_flags, cb, opaque, errp);
625     if (!job) {
626         goto error;
627     }
628 
629     /* The target must match the source in size, so no resize here either */
630     job->target = blk_new(job->common.job.aio_context,
631                           BLK_PERM_WRITE,
632                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
633                           BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD);
634     ret = blk_insert_bs(job->target, target, errp);
635     if (ret < 0) {
636         goto error;
637     }
638 
639     job->on_source_error = on_source_error;
640     job->on_target_error = on_target_error;
641     job->sync_mode = sync_mode;
642     job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
643                        sync_bitmap : NULL;
644     job->compress = compress;
645 
646     /* Detect image-fleecing (and similar) schemes */
647     job->serialize_target_writes = bdrv_chain_contains(target, bs);
648     job->cluster_size = cluster_size;
649     job->copy_bitmap = copy_bitmap;
650     copy_bitmap = NULL;
651     job->use_copy_range = true;
652     job->copy_range_size = MIN_NON_ZERO(blk_get_max_transfer(job->common.blk),
653                                         blk_get_max_transfer(job->target));
654     job->copy_range_size = MAX(job->cluster_size,
655                                QEMU_ALIGN_UP(job->copy_range_size,
656                                              job->cluster_size));
657 
658     /* Required permissions are already taken with target's blk_new() */
659     block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
660                        &error_abort);
661     job->len = len;
662 
663     return &job->common;
664 
665  error:
666     if (copy_bitmap) {
667         assert(!job || !job->copy_bitmap);
668         hbitmap_free(copy_bitmap);
669     }
670     if (sync_bitmap) {
671         bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
672     }
673     if (job) {
674         backup_clean(&job->common.job);
675         job_early_fail(&job->common.job);
676     }
677 
678     return NULL;
679 }
680