xref: /openbmc/qemu/block/qcow2-refcount.c (revision 9bf040b9)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu-common.h"
26 #include "block/block_int.h"
27 #include "block/qcow2.h"
28 #include "qemu/range.h"
29 #include "qapi/qmp/types.h"
30 #include "qapi-event.h"
31 
/*
 * Forward declarations: alloc_refcount_block() below calls both of these
 * helpers before their definitions appear later in this file.
 */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend, enum qcow2_discard_type type);
36 
37 
38 /*********************************************************/
39 /* refcount handling */
40 
41 int qcow2_refcount_init(BlockDriverState *bs)
42 {
43     BDRVQcowState *s = bs->opaque;
44     unsigned int refcount_table_size2, i;
45     int ret;
46 
47     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
48     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
49     s->refcount_table = g_try_malloc(refcount_table_size2);
50 
51     if (s->refcount_table_size > 0) {
52         if (s->refcount_table == NULL) {
53             ret = -ENOMEM;
54             goto fail;
55         }
56         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
57         ret = bdrv_pread(bs->file, s->refcount_table_offset,
58                          s->refcount_table, refcount_table_size2);
59         if (ret < 0) {
60             goto fail;
61         }
62         for(i = 0; i < s->refcount_table_size; i++)
63             be64_to_cpus(&s->refcount_table[i]);
64     }
65     return 0;
66  fail:
67     return ret;
68 }
69 
70 void qcow2_refcount_close(BlockDriverState *bs)
71 {
72     BDRVQcowState *s = bs->opaque;
73     g_free(s->refcount_table);
74 }
75 
76 
77 static int load_refcount_block(BlockDriverState *bs,
78                                int64_t refcount_block_offset,
79                                void **refcount_block)
80 {
81     BDRVQcowState *s = bs->opaque;
82     int ret;
83 
84     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
85     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
86         refcount_block);
87 
88     return ret;
89 }
90 
91 /*
92  * Returns the refcount of the cluster given by its index. Any non-negative
93  * return value is the refcount of the cluster, negative values are -errno
94  * and indicate an error.
95  */
96 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
97 {
98     BDRVQcowState *s = bs->opaque;
99     uint64_t refcount_table_index, block_index;
100     int64_t refcount_block_offset;
101     int ret;
102     uint16_t *refcount_block;
103     uint16_t refcount;
104 
105     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
106     if (refcount_table_index >= s->refcount_table_size)
107         return 0;
108     refcount_block_offset =
109         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
110     if (!refcount_block_offset)
111         return 0;
112 
113     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
114         (void**) &refcount_block);
115     if (ret < 0) {
116         return ret;
117     }
118 
119     block_index = cluster_index &
120         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
121     refcount = be16_to_cpu(refcount_block[block_index]);
122 
123     ret = qcow2_cache_put(bs, s->refcount_block_cache,
124         (void**) &refcount_block);
125     if (ret < 0) {
126         return ret;
127     }
128 
129     return refcount;
130 }
131 
132 /*
133  * Rounds the refcount table size up to avoid growing the table for each single
134  * refcount block that is allocated.
135  */
136 static unsigned int next_refcount_table_size(BDRVQcowState *s,
137     unsigned int min_size)
138 {
139     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
140     unsigned int refcount_table_clusters =
141         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
142 
143     while (min_clusters > refcount_table_clusters) {
144         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
145     }
146 
147     return refcount_table_clusters << (s->cluster_bits - 3);
148 }
149 
150 
151 /* Checks if two offsets are described by the same refcount block */
152 static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
153     uint64_t offset_b)
154 {
155     uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
156     uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
157 
158     return (block_a == block_b);
159 }
160 
161 /*
162  * Loads a refcount block. If it doesn't exist yet, it is allocated first
163  * (including growing the refcount table if needed).
164  *
165  * Returns 0 on success or -errno in error case
166  */
167 static int alloc_refcount_block(BlockDriverState *bs,
168     int64_t cluster_index, uint16_t **refcount_block)
169 {
170     BDRVQcowState *s = bs->opaque;
171     unsigned int refcount_table_index;
172     int ret;
173 
174     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
175 
176     /* Find the refcount block for the given cluster */
177     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
178 
179     if (refcount_table_index < s->refcount_table_size) {
180 
181         uint64_t refcount_block_offset =
182             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
183 
184         /* If it's already there, we're done */
185         if (refcount_block_offset) {
186              return load_refcount_block(bs, refcount_block_offset,
187                  (void**) refcount_block);
188         }
189     }
190 
191     /*
192      * If we came here, we need to allocate something. Something is at least
193      * a cluster for the new refcount block. It may also include a new refcount
194      * table if the old refcount table is too small.
195      *
196      * Note that allocating clusters here needs some special care:
197      *
198      * - We can't use the normal qcow2_alloc_clusters(), it would try to
199      *   increase the refcount and very likely we would end up with an endless
200      *   recursion. Instead we must place the refcount blocks in a way that
201      *   they can describe them themselves.
202      *
203      * - We need to consider that at this point we are inside update_refcounts
204      *   and potentially doing an initial refcount increase. This means that
205      *   some clusters have already been allocated by the caller, but their
206      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
207      *   need to return -EAGAIN to signal the caller that it needs to restart
208      *   the search for free clusters.
209      *
210      * - alloc_clusters_noref and qcow2_free_clusters may load a different
211      *   refcount block into the cache
212      */
213 
214     *refcount_block = NULL;
215 
216     /* We write to the refcount table, so we might depend on L2 tables */
217     ret = qcow2_cache_flush(bs, s->l2_table_cache);
218     if (ret < 0) {
219         return ret;
220     }
221 
222     /* Allocate the refcount block itself and mark it as used */
223     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
224     if (new_block < 0) {
225         return new_block;
226     }
227 
228 #ifdef DEBUG_ALLOC2
229     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
230         " at %" PRIx64 "\n",
231         refcount_table_index, cluster_index << s->cluster_bits, new_block);
232 #endif
233 
234     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
235         /* Zero the new refcount block before updating it */
236         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
237             (void**) refcount_block);
238         if (ret < 0) {
239             goto fail_block;
240         }
241 
242         memset(*refcount_block, 0, s->cluster_size);
243 
244         /* The block describes itself, need to update the cache */
245         int block_index = (new_block >> s->cluster_bits) &
246             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
247         (*refcount_block)[block_index] = cpu_to_be16(1);
248     } else {
249         /* Described somewhere else. This can recurse at most twice before we
250          * arrive at a block that describes itself. */
251         ret = update_refcount(bs, new_block, s->cluster_size, 1,
252                               QCOW2_DISCARD_NEVER);
253         if (ret < 0) {
254             goto fail_block;
255         }
256 
257         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
258         if (ret < 0) {
259             goto fail_block;
260         }
261 
262         /* Initialize the new refcount block only after updating its refcount,
263          * update_refcount uses the refcount cache itself */
264         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
265             (void**) refcount_block);
266         if (ret < 0) {
267             goto fail_block;
268         }
269 
270         memset(*refcount_block, 0, s->cluster_size);
271     }
272 
273     /* Now the new refcount block needs to be written to disk */
274     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
275     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
276     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
277     if (ret < 0) {
278         goto fail_block;
279     }
280 
281     /* If the refcount table is big enough, just hook the block up there */
282     if (refcount_table_index < s->refcount_table_size) {
283         uint64_t data64 = cpu_to_be64(new_block);
284         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
285         ret = bdrv_pwrite_sync(bs->file,
286             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
287             &data64, sizeof(data64));
288         if (ret < 0) {
289             goto fail_block;
290         }
291 
292         s->refcount_table[refcount_table_index] = new_block;
293 
294         /* The new refcount block may be where the caller intended to put its
295          * data, so let it restart the search. */
296         return -EAGAIN;
297     }
298 
299     ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
300     if (ret < 0) {
301         goto fail_block;
302     }
303 
304     /*
305      * If we come here, we need to grow the refcount table. Again, a new
306      * refcount table needs some space and we can't simply allocate to avoid
307      * endless recursion.
308      *
309      * Therefore let's grab new refcount blocks at the end of the image, which
310      * will describe themselves and the new refcount table. This way we can
311      * reference them only in the new table and do the switch to the new
312      * refcount table at once without producing an inconsistent state in
313      * between.
314      */
315     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
316 
317     /* Calculate the number of refcount blocks needed so far */
318     uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
319     uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);
320 
321     if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
322         return -EFBIG;
323     }
324 
325     /* And now we need at least one block more for the new metadata */
326     uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
327     uint64_t last_table_size;
328     uint64_t blocks_clusters;
329     do {
330         uint64_t table_clusters =
331             size_to_clusters(s, table_size * sizeof(uint64_t));
332         blocks_clusters = 1 +
333             ((table_clusters + refcount_block_clusters - 1)
334             / refcount_block_clusters);
335         uint64_t meta_clusters = table_clusters + blocks_clusters;
336 
337         last_table_size = table_size;
338         table_size = next_refcount_table_size(s, blocks_used +
339             ((meta_clusters + refcount_block_clusters - 1)
340             / refcount_block_clusters));
341 
342     } while (last_table_size != table_size);
343 
344 #ifdef DEBUG_ALLOC2
345     fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
346         s->refcount_table_size, table_size);
347 #endif
348 
349     /* Create the new refcount table and blocks */
350     uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
351         s->cluster_size;
352     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
353     uint64_t *new_table = g_try_new0(uint64_t, table_size);
354     uint16_t *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
355 
356     assert(table_size > 0 && blocks_clusters > 0);
357     if (new_table == NULL || new_blocks == NULL) {
358         ret = -ENOMEM;
359         goto fail_table;
360     }
361 
362     /* Fill the new refcount table */
363     memcpy(new_table, s->refcount_table,
364         s->refcount_table_size * sizeof(uint64_t));
365     new_table[refcount_table_index] = new_block;
366 
367     int i;
368     for (i = 0; i < blocks_clusters; i++) {
369         new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
370     }
371 
372     /* Fill the refcount blocks */
373     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
374     int block = 0;
375     for (i = 0; i < table_clusters + blocks_clusters; i++) {
376         new_blocks[block++] = cpu_to_be16(1);
377     }
378 
379     /* Write refcount blocks to disk */
380     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
381     ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
382         blocks_clusters * s->cluster_size);
383     g_free(new_blocks);
384     new_blocks = NULL;
385     if (ret < 0) {
386         goto fail_table;
387     }
388 
389     /* Write refcount table to disk */
390     for(i = 0; i < table_size; i++) {
391         cpu_to_be64s(&new_table[i]);
392     }
393 
394     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
395     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
396         table_size * sizeof(uint64_t));
397     if (ret < 0) {
398         goto fail_table;
399     }
400 
401     for(i = 0; i < table_size; i++) {
402         be64_to_cpus(&new_table[i]);
403     }
404 
405     /* Hook up the new refcount table in the qcow2 header */
406     uint8_t data[12];
407     cpu_to_be64w((uint64_t*)data, table_offset);
408     cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
409     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
410     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
411         data, sizeof(data));
412     if (ret < 0) {
413         goto fail_table;
414     }
415 
416     /* And switch it in memory */
417     uint64_t old_table_offset = s->refcount_table_offset;
418     uint64_t old_table_size = s->refcount_table_size;
419 
420     g_free(s->refcount_table);
421     s->refcount_table = new_table;
422     s->refcount_table_size = table_size;
423     s->refcount_table_offset = table_offset;
424 
425     /* Free old table. */
426     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
427                         QCOW2_DISCARD_OTHER);
428 
429     ret = load_refcount_block(bs, new_block, (void**) refcount_block);
430     if (ret < 0) {
431         return ret;
432     }
433 
434     /* If we were trying to do the initial refcount update for some cluster
435      * allocation, we might have used the same clusters to store newly
436      * allocated metadata. Make the caller search some new space. */
437     return -EAGAIN;
438 
439 fail_table:
440     g_free(new_blocks);
441     g_free(new_table);
442 fail_block:
443     if (*refcount_block != NULL) {
444         qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
445     }
446     return ret;
447 }
448 
449 void qcow2_process_discards(BlockDriverState *bs, int ret)
450 {
451     BDRVQcowState *s = bs->opaque;
452     Qcow2DiscardRegion *d, *next;
453 
454     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
455         QTAILQ_REMOVE(&s->discards, d, next);
456 
457         /* Discard is optional, ignore the return value */
458         if (ret >= 0) {
459             bdrv_discard(bs->file,
460                          d->offset >> BDRV_SECTOR_BITS,
461                          d->bytes >> BDRV_SECTOR_BITS);
462         }
463 
464         g_free(d);
465     }
466 }
467 
468 static void update_refcount_discard(BlockDriverState *bs,
469                                     uint64_t offset, uint64_t length)
470 {
471     BDRVQcowState *s = bs->opaque;
472     Qcow2DiscardRegion *d, *p, *next;
473 
474     QTAILQ_FOREACH(d, &s->discards, next) {
475         uint64_t new_start = MIN(offset, d->offset);
476         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
477 
478         if (new_end - new_start <= length + d->bytes) {
479             /* There can't be any overlap, areas ending up here have no
480              * references any more and therefore shouldn't get freed another
481              * time. */
482             assert(d->bytes + length == new_end - new_start);
483             d->offset = new_start;
484             d->bytes = new_end - new_start;
485             goto found;
486         }
487     }
488 
489     d = g_malloc(sizeof(*d));
490     *d = (Qcow2DiscardRegion) {
491         .bs     = bs,
492         .offset = offset,
493         .bytes  = length,
494     };
495     QTAILQ_INSERT_TAIL(&s->discards, d, next);
496 
497 found:
498     /* Merge discard requests if they are adjacent now */
499     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
500         if (p == d
501             || p->offset > d->offset + d->bytes
502             || d->offset > p->offset + p->bytes)
503         {
504             continue;
505         }
506 
507         /* Still no overlap possible */
508         assert(p->offset == d->offset + d->bytes
509             || d->offset == p->offset + p->bytes);
510 
511         QTAILQ_REMOVE(&s->discards, p, next);
512         d->offset = MIN(d->offset, p->offset);
513         d->bytes += p->bytes;
514     }
515 }
516 
517 /* XXX: cache several refcount block clusters ? */
518 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
519     int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
520 {
521     BDRVQcowState *s = bs->opaque;
522     int64_t start, last, cluster_offset;
523     uint16_t *refcount_block = NULL;
524     int64_t old_table_index = -1;
525     int ret;
526 
527 #ifdef DEBUG_ALLOC2
528     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
529            offset, length, addend);
530 #endif
531     if (length < 0) {
532         return -EINVAL;
533     } else if (length == 0) {
534         return 0;
535     }
536 
537     if (addend < 0) {
538         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
539             s->l2_table_cache);
540     }
541 
542     start = start_of_cluster(s, offset);
543     last = start_of_cluster(s, offset + length - 1);
544     for(cluster_offset = start; cluster_offset <= last;
545         cluster_offset += s->cluster_size)
546     {
547         int block_index, refcount;
548         int64_t cluster_index = cluster_offset >> s->cluster_bits;
549         int64_t table_index =
550             cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
551 
552         /* Load the refcount block and allocate it if needed */
553         if (table_index != old_table_index) {
554             if (refcount_block) {
555                 ret = qcow2_cache_put(bs, s->refcount_block_cache,
556                     (void**) &refcount_block);
557                 if (ret < 0) {
558                     goto fail;
559                 }
560             }
561 
562             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
563             if (ret < 0) {
564                 goto fail;
565             }
566         }
567         old_table_index = table_index;
568 
569         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
570 
571         /* we can update the count and save it */
572         block_index = cluster_index &
573             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
574 
575         refcount = be16_to_cpu(refcount_block[block_index]);
576         refcount += addend;
577         if (refcount < 0 || refcount > 0xffff) {
578             ret = -EINVAL;
579             goto fail;
580         }
581         if (refcount == 0 && cluster_index < s->free_cluster_index) {
582             s->free_cluster_index = cluster_index;
583         }
584         refcount_block[block_index] = cpu_to_be16(refcount);
585 
586         if (refcount == 0 && s->discard_passthrough[type]) {
587             update_refcount_discard(bs, cluster_offset, s->cluster_size);
588         }
589     }
590 
591     ret = 0;
592 fail:
593     if (!s->cache_discards) {
594         qcow2_process_discards(bs, ret);
595     }
596 
597     /* Write last changed block to disk */
598     if (refcount_block) {
599         int wret;
600         wret = qcow2_cache_put(bs, s->refcount_block_cache,
601             (void**) &refcount_block);
602         if (wret < 0) {
603             return ret < 0 ? ret : wret;
604         }
605     }
606 
607     /*
608      * Try do undo any updates if an error is returned (This may succeed in
609      * some cases like ENOSPC for allocating a new refcount block)
610      */
611     if (ret < 0) {
612         int dummy;
613         dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
614                                 QCOW2_DISCARD_NEVER);
615         (void)dummy;
616     }
617 
618     return ret;
619 }
620 
621 /*
622  * Increases or decreases the refcount of a given cluster by one.
623  * addend must be 1 or -1.
624  *
625  * If the return value is non-negative, it is the new refcount of the cluster.
626  * If it is negative, it is -errno and indicates an error.
627  */
628 int qcow2_update_cluster_refcount(BlockDriverState *bs,
629                                   int64_t cluster_index,
630                                   int addend,
631                                   enum qcow2_discard_type type)
632 {
633     BDRVQcowState *s = bs->opaque;
634     int ret;
635 
636     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
637                           type);
638     if (ret < 0) {
639         return ret;
640     }
641 
642     return get_refcount(bs, cluster_index);
643 }
644 
645 
646 
647 /*********************************************************/
648 /* cluster allocation functions */
649 
650 
651 
652 /* return < 0 if error */
653 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
654 {
655     BDRVQcowState *s = bs->opaque;
656     uint64_t i, nb_clusters;
657     int refcount;
658 
659     nb_clusters = size_to_clusters(s, size);
660 retry:
661     for(i = 0; i < nb_clusters; i++) {
662         uint64_t next_cluster_index = s->free_cluster_index++;
663         refcount = get_refcount(bs, next_cluster_index);
664 
665         if (refcount < 0) {
666             return refcount;
667         } else if (refcount != 0) {
668             goto retry;
669         }
670     }
671 
672     /* Make sure that all offsets in the "allocated" range are representable
673      * in an int64_t */
674     if (s->free_cluster_index > 0 &&
675         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
676     {
677         return -EFBIG;
678     }
679 
680 #ifdef DEBUG_ALLOC2
681     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
682             size,
683             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
684 #endif
685     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
686 }
687 
688 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
689 {
690     int64_t offset;
691     int ret;
692 
693     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
694     do {
695         offset = alloc_clusters_noref(bs, size);
696         if (offset < 0) {
697             return offset;
698         }
699 
700         ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
701     } while (ret == -EAGAIN);
702 
703     if (ret < 0) {
704         return ret;
705     }
706 
707     return offset;
708 }
709 
710 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
711     int nb_clusters)
712 {
713     BDRVQcowState *s = bs->opaque;
714     uint64_t cluster_index;
715     uint64_t i;
716     int refcount, ret;
717 
718     assert(nb_clusters >= 0);
719     if (nb_clusters == 0) {
720         return 0;
721     }
722 
723     do {
724         /* Check how many clusters there are free */
725         cluster_index = offset >> s->cluster_bits;
726         for(i = 0; i < nb_clusters; i++) {
727             refcount = get_refcount(bs, cluster_index++);
728 
729             if (refcount < 0) {
730                 return refcount;
731             } else if (refcount != 0) {
732                 break;
733             }
734         }
735 
736         /* And then allocate them */
737         ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
738                               QCOW2_DISCARD_NEVER);
739     } while (ret == -EAGAIN);
740 
741     if (ret < 0) {
742         return ret;
743     }
744 
745     return i;
746 }
747 
748 /* only used to allocate compressed sectors. We try to allocate
749    contiguous sectors. size must be <= cluster_size */
750 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
751 {
752     BDRVQcowState *s = bs->opaque;
753     int64_t offset, cluster_offset;
754     int free_in_cluster;
755 
756     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
757     assert(size > 0 && size <= s->cluster_size);
758     if (s->free_byte_offset == 0) {
759         offset = qcow2_alloc_clusters(bs, s->cluster_size);
760         if (offset < 0) {
761             return offset;
762         }
763         s->free_byte_offset = offset;
764     }
765  redo:
766     free_in_cluster = s->cluster_size -
767         offset_into_cluster(s, s->free_byte_offset);
768     if (size <= free_in_cluster) {
769         /* enough space in current cluster */
770         offset = s->free_byte_offset;
771         s->free_byte_offset += size;
772         free_in_cluster -= size;
773         if (free_in_cluster == 0)
774             s->free_byte_offset = 0;
775         if (offset_into_cluster(s, offset) != 0)
776             qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
777                                           QCOW2_DISCARD_NEVER);
778     } else {
779         offset = qcow2_alloc_clusters(bs, s->cluster_size);
780         if (offset < 0) {
781             return offset;
782         }
783         cluster_offset = start_of_cluster(s, s->free_byte_offset);
784         if ((cluster_offset + s->cluster_size) == offset) {
785             /* we are lucky: contiguous data */
786             offset = s->free_byte_offset;
787             qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
788                                           QCOW2_DISCARD_NEVER);
789             s->free_byte_offset += size;
790         } else {
791             s->free_byte_offset = offset;
792             goto redo;
793         }
794     }
795 
796     /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
797      * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
798      * be flushed before the caller's L2 table updates.
799      */
800     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
801     return offset;
802 }
803 
804 void qcow2_free_clusters(BlockDriverState *bs,
805                           int64_t offset, int64_t size,
806                           enum qcow2_discard_type type)
807 {
808     int ret;
809 
810     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
811     ret = update_refcount(bs, offset, size, -1, type);
812     if (ret < 0) {
813         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
814         /* TODO Remember the clusters to free them later and avoid leaking */
815     }
816 }
817 
818 /*
819  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
820  * normal cluster, compressed cluster, etc.)
821  */
822 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
823                              int nb_clusters, enum qcow2_discard_type type)
824 {
825     BDRVQcowState *s = bs->opaque;
826 
827     switch (qcow2_get_cluster_type(l2_entry)) {
828     case QCOW2_CLUSTER_COMPRESSED:
829         {
830             int nb_csectors;
831             nb_csectors = ((l2_entry >> s->csize_shift) &
832                            s->csize_mask) + 1;
833             qcow2_free_clusters(bs,
834                 (l2_entry & s->cluster_offset_mask) & ~511,
835                 nb_csectors * 512, type);
836         }
837         break;
838     case QCOW2_CLUSTER_NORMAL:
839     case QCOW2_CLUSTER_ZERO:
840         if (l2_entry & L2E_OFFSET_MASK) {
841             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
842                                 nb_clusters << s->cluster_bits, type);
843         }
844         break;
845     case QCOW2_CLUSTER_UNALLOCATED:
846         break;
847     default:
848         abort();
849     }
850 }
851 
852 
853 
854 /*********************************************************/
855 /* snapshots and image creation */
856 
857 
858 
859 /* update the refcounts of snapshots and the copied flag */
860 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
861     int64_t l1_table_offset, int l1_size, int addend)
862 {
863     BDRVQcowState *s = bs->opaque;
864     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
865     bool l1_allocated = false;
866     int64_t old_offset, old_l2_offset;
867     int i, j, l1_modified = 0, nb_csectors, refcount;
868     int ret;
869 
870     l2_table = NULL;
871     l1_table = NULL;
872     l1_size2 = l1_size * sizeof(uint64_t);
873 
874     s->cache_discards = true;
875 
876     /* WARNING: qcow2_snapshot_goto relies on this function not using the
877      * l1_table_offset when it is the current s->l1_table_offset! Be careful
878      * when changing this! */
879     if (l1_table_offset != s->l1_table_offset) {
880         l1_table = g_try_malloc0(align_offset(l1_size2, 512));
881         if (l1_size2 && l1_table == NULL) {
882             ret = -ENOMEM;
883             goto fail;
884         }
885         l1_allocated = true;
886 
887         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
888         if (ret < 0) {
889             goto fail;
890         }
891 
892         for(i = 0;i < l1_size; i++)
893             be64_to_cpus(&l1_table[i]);
894     } else {
895         assert(l1_size == s->l1_size);
896         l1_table = s->l1_table;
897         l1_allocated = false;
898     }
899 
900     for(i = 0; i < l1_size; i++) {
901         l2_offset = l1_table[i];
902         if (l2_offset) {
903             old_l2_offset = l2_offset;
904             l2_offset &= L1E_OFFSET_MASK;
905 
906             ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
907                 (void**) &l2_table);
908             if (ret < 0) {
909                 goto fail;
910             }
911 
912             for(j = 0; j < s->l2_size; j++) {
913                 uint64_t cluster_index;
914 
915                 offset = be64_to_cpu(l2_table[j]);
916                 old_offset = offset;
917                 offset &= ~QCOW_OFLAG_COPIED;
918 
919                 switch (qcow2_get_cluster_type(offset)) {
920                     case QCOW2_CLUSTER_COMPRESSED:
921                         nb_csectors = ((offset >> s->csize_shift) &
922                                        s->csize_mask) + 1;
923                         if (addend != 0) {
924                             ret = update_refcount(bs,
925                                 (offset & s->cluster_offset_mask) & ~511,
926                                 nb_csectors * 512, addend,
927                                 QCOW2_DISCARD_SNAPSHOT);
928                             if (ret < 0) {
929                                 goto fail;
930                             }
931                         }
932                         /* compressed clusters are never modified */
933                         refcount = 2;
934                         break;
935 
936                     case QCOW2_CLUSTER_NORMAL:
937                     case QCOW2_CLUSTER_ZERO:
938                         cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
939                         if (!cluster_index) {
940                             /* unallocated */
941                             refcount = 0;
942                             break;
943                         }
944                         if (addend != 0) {
945                             refcount = qcow2_update_cluster_refcount(bs,
946                                     cluster_index, addend,
947                                     QCOW2_DISCARD_SNAPSHOT);
948                         } else {
949                             refcount = get_refcount(bs, cluster_index);
950                         }
951 
952                         if (refcount < 0) {
953                             ret = refcount;
954                             goto fail;
955                         }
956                         break;
957 
958                     case QCOW2_CLUSTER_UNALLOCATED:
959                         refcount = 0;
960                         break;
961 
962                     default:
963                         abort();
964                 }
965 
966                 if (refcount == 1) {
967                     offset |= QCOW_OFLAG_COPIED;
968                 }
969                 if (offset != old_offset) {
970                     if (addend > 0) {
971                         qcow2_cache_set_dependency(bs, s->l2_table_cache,
972                             s->refcount_block_cache);
973                     }
974                     l2_table[j] = cpu_to_be64(offset);
975                     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
976                 }
977             }
978 
979             ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
980             if (ret < 0) {
981                 goto fail;
982             }
983 
984 
985             if (addend != 0) {
986                 refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
987                         s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
988             } else {
989                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
990             }
991             if (refcount < 0) {
992                 ret = refcount;
993                 goto fail;
994             } else if (refcount == 1) {
995                 l2_offset |= QCOW_OFLAG_COPIED;
996             }
997             if (l2_offset != old_l2_offset) {
998                 l1_table[i] = l2_offset;
999                 l1_modified = 1;
1000             }
1001         }
1002     }
1003 
1004     ret = bdrv_flush(bs);
1005 fail:
1006     if (l2_table) {
1007         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1008     }
1009 
1010     s->cache_discards = false;
1011     qcow2_process_discards(bs, ret);
1012 
1013     /* Update L1 only if it isn't deleted anyway (addend = -1) */
1014     if (ret == 0 && addend >= 0 && l1_modified) {
1015         for (i = 0; i < l1_size; i++) {
1016             cpu_to_be64s(&l1_table[i]);
1017         }
1018 
1019         ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
1020 
1021         for (i = 0; i < l1_size; i++) {
1022             be64_to_cpus(&l1_table[i]);
1023         }
1024     }
1025     if (l1_allocated)
1026         g_free(l1_table);
1027     return ret;
1028 }
1029 
1030 
1031 
1032 
1033 /*********************************************************/
1034 /* refcount checking functions */
1035 
1036 
1037 
1038 /*
1039  * Increases the refcount for a range of clusters in a given refcount table.
1040  * This is used to construct a temporary refcount table out of L1 and L2 tables
1041  * which can be compared the the refcount table saved in the image.
1042  *
1043  * Modifies the number of errors in res.
1044  */
1045 static void inc_refcounts(BlockDriverState *bs,
1046                           BdrvCheckResult *res,
1047                           uint16_t *refcount_table,
1048                           int refcount_table_size,
1049                           int64_t offset, int64_t size)
1050 {
1051     BDRVQcowState *s = bs->opaque;
1052     uint64_t start, last, cluster_offset, k;
1053 
1054     if (size <= 0)
1055         return;
1056 
1057     start = start_of_cluster(s, offset);
1058     last = start_of_cluster(s, offset + size - 1);
1059     for(cluster_offset = start; cluster_offset <= last;
1060         cluster_offset += s->cluster_size) {
1061         k = cluster_offset >> s->cluster_bits;
1062         if (k >= refcount_table_size) {
1063             fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
1064                 "the end of the image file, can't properly check refcounts.\n",
1065                 cluster_offset);
1066             res->check_errors++;
1067         } else {
1068             if (++refcount_table[k] == 0) {
1069                 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1070                     "\n", cluster_offset);
1071                 res->corruptions++;
1072             }
1073         }
1074     }
1075 }
1076 
/*
 * Bit flags accepted by check_refcounts_l1() and check_refcounts_l2() in
 * their flags argument.
 */
enum {
    /* update BlockFragInfo counters */
    CHECK_FRAG_INFO = 0x2,
};
1081 
1082 /*
1083  * Increases the refcount in the given refcount table for the all clusters
1084  * referenced in the L2 table. While doing so, performs some checks on L2
1085  * entries.
1086  *
1087  * Returns the number of errors found by the checks or -errno if an internal
1088  * error occurred.
1089  */
1090 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1091     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
1092     int flags)
1093 {
1094     BDRVQcowState *s = bs->opaque;
1095     uint64_t *l2_table, l2_entry;
1096     uint64_t next_contiguous_offset = 0;
1097     int i, l2_size, nb_csectors;
1098 
1099     /* Read L2 table from disk */
1100     l2_size = s->l2_size * sizeof(uint64_t);
1101     l2_table = g_malloc(l2_size);
1102 
1103     if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
1104         goto fail;
1105 
1106     /* Do the actual checks */
1107     for(i = 0; i < s->l2_size; i++) {
1108         l2_entry = be64_to_cpu(l2_table[i]);
1109 
1110         switch (qcow2_get_cluster_type(l2_entry)) {
1111         case QCOW2_CLUSTER_COMPRESSED:
1112             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1113             if (l2_entry & QCOW_OFLAG_COPIED) {
1114                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1115                     "copied flag must never be set for compressed "
1116                     "clusters\n", l2_entry >> s->cluster_bits);
1117                 l2_entry &= ~QCOW_OFLAG_COPIED;
1118                 res->corruptions++;
1119             }
1120 
1121             /* Mark cluster as used */
1122             nb_csectors = ((l2_entry >> s->csize_shift) &
1123                            s->csize_mask) + 1;
1124             l2_entry &= s->cluster_offset_mask;
1125             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1126                 l2_entry & ~511, nb_csectors * 512);
1127 
1128             if (flags & CHECK_FRAG_INFO) {
1129                 res->bfi.allocated_clusters++;
1130                 res->bfi.compressed_clusters++;
1131 
1132                 /* Compressed clusters are fragmented by nature.  Since they
1133                  * take up sub-sector space but we only have sector granularity
1134                  * I/O we need to re-read the same sectors even for adjacent
1135                  * compressed clusters.
1136                  */
1137                 res->bfi.fragmented_clusters++;
1138             }
1139             break;
1140 
1141         case QCOW2_CLUSTER_ZERO:
1142             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1143                 break;
1144             }
1145             /* fall through */
1146 
1147         case QCOW2_CLUSTER_NORMAL:
1148         {
1149             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1150 
1151             if (flags & CHECK_FRAG_INFO) {
1152                 res->bfi.allocated_clusters++;
1153                 if (next_contiguous_offset &&
1154                     offset != next_contiguous_offset) {
1155                     res->bfi.fragmented_clusters++;
1156                 }
1157                 next_contiguous_offset = offset + s->cluster_size;
1158             }
1159 
1160             /* Mark cluster as used */
1161             inc_refcounts(bs, res, refcount_table,refcount_table_size,
1162                 offset, s->cluster_size);
1163 
1164             /* Correct offsets are cluster aligned */
1165             if (offset_into_cluster(s, offset)) {
1166                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1167                     "properly aligned; L2 entry corrupted.\n", offset);
1168                 res->corruptions++;
1169             }
1170             break;
1171         }
1172 
1173         case QCOW2_CLUSTER_UNALLOCATED:
1174             break;
1175 
1176         default:
1177             abort();
1178         }
1179     }
1180 
1181     g_free(l2_table);
1182     return 0;
1183 
1184 fail:
1185     fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1186     g_free(l2_table);
1187     return -EIO;
1188 }
1189 
1190 /*
1191  * Increases the refcount for the L1 table, its L2 tables and all referenced
1192  * clusters in the given refcount table. While doing so, performs some checks
1193  * on L1 and L2 entries.
1194  *
1195  * Returns the number of errors found by the checks or -errno if an internal
1196  * error occurred.
1197  */
1198 static int check_refcounts_l1(BlockDriverState *bs,
1199                               BdrvCheckResult *res,
1200                               uint16_t *refcount_table,
1201                               int refcount_table_size,
1202                               int64_t l1_table_offset, int l1_size,
1203                               int flags)
1204 {
1205     BDRVQcowState *s = bs->opaque;
1206     uint64_t *l1_table, l2_offset, l1_size2;
1207     int i, ret;
1208 
1209     l1_size2 = l1_size * sizeof(uint64_t);
1210 
1211     /* Mark L1 table as used */
1212     inc_refcounts(bs, res, refcount_table, refcount_table_size,
1213         l1_table_offset, l1_size2);
1214 
1215     /* Read L1 table entries from disk */
1216     if (l1_size2 == 0) {
1217         l1_table = NULL;
1218     } else {
1219         l1_table = g_try_malloc(l1_size2);
1220         if (l1_table == NULL) {
1221             ret = -ENOMEM;
1222             goto fail;
1223         }
1224         if (bdrv_pread(bs->file, l1_table_offset,
1225                        l1_table, l1_size2) != l1_size2)
1226             goto fail;
1227         for(i = 0;i < l1_size; i++)
1228             be64_to_cpus(&l1_table[i]);
1229     }
1230 
1231     /* Do the actual checks */
1232     for(i = 0; i < l1_size; i++) {
1233         l2_offset = l1_table[i];
1234         if (l2_offset) {
1235             /* Mark L2 table as used */
1236             l2_offset &= L1E_OFFSET_MASK;
1237             inc_refcounts(bs, res, refcount_table, refcount_table_size,
1238                 l2_offset, s->cluster_size);
1239 
1240             /* L2 tables are cluster aligned */
1241             if (offset_into_cluster(s, l2_offset)) {
1242                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1243                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1244                 res->corruptions++;
1245             }
1246 
1247             /* Process and check L2 entries */
1248             ret = check_refcounts_l2(bs, res, refcount_table,
1249                                      refcount_table_size, l2_offset, flags);
1250             if (ret < 0) {
1251                 goto fail;
1252             }
1253         }
1254     }
1255     g_free(l1_table);
1256     return 0;
1257 
1258 fail:
1259     fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1260     res->check_errors++;
1261     g_free(l1_table);
1262     return -EIO;
1263 }
1264 
1265 /*
1266  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1267  *
1268  * This function does not print an error message nor does it increment
1269  * check_errors if get_refcount fails (this is because such an error will have
1270  * been already detected and sufficiently signaled by the calling function
1271  * (qcow2_check_refcounts) by the time this function is called).
1272  */
1273 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1274                               BdrvCheckMode fix)
1275 {
1276     BDRVQcowState *s = bs->opaque;
1277     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1278     int ret;
1279     int refcount;
1280     int i, j;
1281 
1282     for (i = 0; i < s->l1_size; i++) {
1283         uint64_t l1_entry = s->l1_table[i];
1284         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1285         bool l2_dirty = false;
1286 
1287         if (!l2_offset) {
1288             continue;
1289         }
1290 
1291         refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1292         if (refcount < 0) {
1293             /* don't print message nor increment check_errors */
1294             continue;
1295         }
1296         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1297             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1298                     "l1_entry=%" PRIx64 " refcount=%d\n",
1299                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1300                                             "ERROR",
1301                     i, l1_entry, refcount);
1302             if (fix & BDRV_FIX_ERRORS) {
1303                 s->l1_table[i] = refcount == 1
1304                                ? l1_entry |  QCOW_OFLAG_COPIED
1305                                : l1_entry & ~QCOW_OFLAG_COPIED;
1306                 ret = qcow2_write_l1_entry(bs, i);
1307                 if (ret < 0) {
1308                     res->check_errors++;
1309                     goto fail;
1310                 }
1311                 res->corruptions_fixed++;
1312             } else {
1313                 res->corruptions++;
1314             }
1315         }
1316 
1317         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1318                          s->l2_size * sizeof(uint64_t));
1319         if (ret < 0) {
1320             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1321                     strerror(-ret));
1322             res->check_errors++;
1323             goto fail;
1324         }
1325 
1326         for (j = 0; j < s->l2_size; j++) {
1327             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1328             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1329             int cluster_type = qcow2_get_cluster_type(l2_entry);
1330 
1331             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1332                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1333                 refcount = get_refcount(bs, data_offset >> s->cluster_bits);
1334                 if (refcount < 0) {
1335                     /* don't print message nor increment check_errors */
1336                     continue;
1337                 }
1338                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1339                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1340                             "l2_entry=%" PRIx64 " refcount=%d\n",
1341                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1342                                                     "ERROR",
1343                             l2_entry, refcount);
1344                     if (fix & BDRV_FIX_ERRORS) {
1345                         l2_table[j] = cpu_to_be64(refcount == 1
1346                                     ? l2_entry |  QCOW_OFLAG_COPIED
1347                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1348                         l2_dirty = true;
1349                         res->corruptions_fixed++;
1350                     } else {
1351                         res->corruptions++;
1352                     }
1353                 }
1354             }
1355         }
1356 
1357         if (l2_dirty) {
1358             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1359                                                 l2_offset, s->cluster_size);
1360             if (ret < 0) {
1361                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1362                         "overlap check failed: %s\n", strerror(-ret));
1363                 res->check_errors++;
1364                 goto fail;
1365             }
1366 
1367             ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
1368             if (ret < 0) {
1369                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1370                         strerror(-ret));
1371                 res->check_errors++;
1372                 goto fail;
1373             }
1374         }
1375     }
1376 
1377     ret = 0;
1378 
1379 fail:
1380     qemu_vfree(l2_table);
1381     return ret;
1382 }
1383 
1384 /*
1385  * Writes one sector of the refcount table to the disk
1386  */
1387 #define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
1388 static int write_reftable_entry(BlockDriverState *bs, int rt_index)
1389 {
1390     BDRVQcowState *s = bs->opaque;
1391     uint64_t buf[RT_ENTRIES_PER_SECTOR];
1392     int rt_start_index;
1393     int i, ret;
1394 
1395     rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
1396     for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
1397         buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
1398     }
1399 
1400     ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
1401             s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
1402             sizeof(buf));
1403     if (ret < 0) {
1404         return ret;
1405     }
1406 
1407     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
1408     ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
1409             rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
1410     if (ret < 0) {
1411         return ret;
1412     }
1413 
1414     return 0;
1415 }
1416 
1417 /*
1418  * Allocates a new cluster for the given refcount block (represented by its
1419  * offset in the image file) and copies the current content there. This function
1420  * does _not_ decrement the reference count for the currently occupied cluster.
1421  *
1422  * This function prints an informative message to stderr on error (and returns
1423  * -errno); on success, the offset of the newly allocated cluster is returned.
1424  */
1425 static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
1426                                       uint64_t offset)
1427 {
1428     BDRVQcowState *s = bs->opaque;
1429     int64_t new_offset = 0;
1430     void *refcount_block = NULL;
1431     int ret;
1432 
1433     /* allocate new refcount block */
1434     new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
1435     if (new_offset < 0) {
1436         fprintf(stderr, "Could not allocate new cluster: %s\n",
1437                 strerror(-new_offset));
1438         ret = new_offset;
1439         goto done;
1440     }
1441 
1442     /* fetch current refcount block content */
1443     ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
1444     if (ret < 0) {
1445         fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
1446         goto fail_free_cluster;
1447     }
1448 
1449     /* new block has not yet been entered into refcount table, therefore it is
1450      * no refcount block yet (regarding this check) */
1451     ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
1452     if (ret < 0) {
1453         fprintf(stderr, "Could not write refcount block; metadata overlap "
1454                 "check failed: %s\n", strerror(-ret));
1455         /* the image will be marked corrupt, so don't even attempt on freeing
1456          * the cluster */
1457         goto done;
1458     }
1459 
1460     /* write to new block */
1461     ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
1462             s->cluster_sectors);
1463     if (ret < 0) {
1464         fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
1465         goto fail_free_cluster;
1466     }
1467 
1468     /* update refcount table */
1469     assert(!offset_into_cluster(s, new_offset));
1470     s->refcount_table[reftable_index] = new_offset;
1471     ret = write_reftable_entry(bs, reftable_index);
1472     if (ret < 0) {
1473         fprintf(stderr, "Could not update refcount table: %s\n",
1474                 strerror(-ret));
1475         goto fail_free_cluster;
1476     }
1477 
1478     goto done;
1479 
1480 fail_free_cluster:
1481     qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER);
1482 
1483 done:
1484     if (refcount_block) {
1485         /* This should never fail, as it would only do so if the given refcount
1486          * block cannot be found in the cache. As this is impossible as long as
1487          * there are no bugs, assert the success. */
1488         int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
1489         assert(tmp == 0);
1490     }
1491 
1492     if (ret < 0) {
1493         return ret;
1494     }
1495 
1496     return new_offset;
1497 }
1498 
1499 /*
1500  * Checks an image for refcount consistency.
1501  *
1502  * Returns 0 if no errors are found, the number of errors in case the image is
1503  * detected as corrupted, and -errno when an internal error occurred.
1504  */
1505 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1506                           BdrvCheckMode fix)
1507 {
1508     BDRVQcowState *s = bs->opaque;
1509     int64_t size, i, highest_cluster, nb_clusters;
1510     int refcount1, refcount2;
1511     QCowSnapshot *sn;
1512     uint16_t *refcount_table;
1513     int ret;
1514 
1515     size = bdrv_getlength(bs->file);
1516     if (size < 0) {
1517         res->check_errors++;
1518         return size;
1519     }
1520 
1521     nb_clusters = size_to_clusters(s, size);
1522     if (nb_clusters > INT_MAX) {
1523         res->check_errors++;
1524         return -EFBIG;
1525     }
1526 
1527     refcount_table = g_try_new0(uint16_t, nb_clusters);
1528     if (nb_clusters && refcount_table == NULL) {
1529         res->check_errors++;
1530         return -ENOMEM;
1531     }
1532 
1533     res->bfi.total_clusters =
1534         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
1535 
1536     /* header */
1537     inc_refcounts(bs, res, refcount_table, nb_clusters,
1538         0, s->cluster_size);
1539 
1540     /* current L1 table */
1541     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1542                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1543     if (ret < 0) {
1544         goto fail;
1545     }
1546 
1547     /* snapshots */
1548     for(i = 0; i < s->nb_snapshots; i++) {
1549         sn = s->snapshots + i;
1550         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1551             sn->l1_table_offset, sn->l1_size, 0);
1552         if (ret < 0) {
1553             goto fail;
1554         }
1555     }
1556     inc_refcounts(bs, res, refcount_table, nb_clusters,
1557         s->snapshots_offset, s->snapshots_size);
1558 
1559     /* refcount data */
1560     inc_refcounts(bs, res, refcount_table, nb_clusters,
1561         s->refcount_table_offset,
1562         s->refcount_table_size * sizeof(uint64_t));
1563 
1564     for(i = 0; i < s->refcount_table_size; i++) {
1565         uint64_t offset, cluster;
1566         offset = s->refcount_table[i];
1567         cluster = offset >> s->cluster_bits;
1568 
1569         /* Refcount blocks are cluster aligned */
1570         if (offset_into_cluster(s, offset)) {
1571             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1572                 "cluster aligned; refcount table entry corrupted\n", i);
1573             res->corruptions++;
1574             continue;
1575         }
1576 
1577         if (cluster >= nb_clusters) {
1578             fprintf(stderr, "ERROR refcount block %" PRId64
1579                     " is outside image\n", i);
1580             res->corruptions++;
1581             continue;
1582         }
1583 
1584         if (offset != 0) {
1585             inc_refcounts(bs, res, refcount_table, nb_clusters,
1586                 offset, s->cluster_size);
1587             if (refcount_table[cluster] != 1) {
1588                 fprintf(stderr, "%s refcount block %" PRId64
1589                     " refcount=%d\n",
1590                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1591                                             "ERROR",
1592                     i, refcount_table[cluster]);
1593 
1594                 if (fix & BDRV_FIX_ERRORS) {
1595                     int64_t new_offset;
1596 
1597                     new_offset = realloc_refcount_block(bs, i, offset);
1598                     if (new_offset < 0) {
1599                         res->corruptions++;
1600                         continue;
1601                     }
1602 
1603                     /* update refcounts */
1604                     if ((new_offset >> s->cluster_bits) >= nb_clusters) {
1605                         /* increase refcount_table size if necessary */
1606                         int old_nb_clusters = nb_clusters;
1607                         nb_clusters = (new_offset >> s->cluster_bits) + 1;
1608                         refcount_table = g_renew(uint16_t, refcount_table,
1609                                                  nb_clusters);
1610                         memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
1611                                 - old_nb_clusters) * sizeof(uint16_t));
1612                     }
1613                     refcount_table[cluster]--;
1614                     inc_refcounts(bs, res, refcount_table, nb_clusters,
1615                             new_offset, s->cluster_size);
1616 
1617                     res->corruptions_fixed++;
1618                 } else {
1619                     res->corruptions++;
1620                 }
1621             }
1622         }
1623     }
1624 
1625     /* compare ref counts */
1626     for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
1627         refcount1 = get_refcount(bs, i);
1628         if (refcount1 < 0) {
1629             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1630                 i, strerror(-refcount1));
1631             res->check_errors++;
1632             continue;
1633         }
1634 
1635         refcount2 = refcount_table[i];
1636 
1637         if (refcount1 > 0 || refcount2 > 0) {
1638             highest_cluster = i;
1639         }
1640 
1641         if (refcount1 != refcount2) {
1642 
1643             /* Check if we're allowed to fix the mismatch */
1644             int *num_fixed = NULL;
1645             if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1646                 num_fixed = &res->leaks_fixed;
1647             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1648                 num_fixed = &res->corruptions_fixed;
1649             }
1650 
1651             fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
1652                    num_fixed != NULL     ? "Repairing" :
1653                    refcount1 < refcount2 ? "ERROR" :
1654                                            "Leaked",
1655                    i, refcount1, refcount2);
1656 
1657             if (num_fixed) {
1658                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1659                                       refcount2 - refcount1,
1660                                       QCOW2_DISCARD_ALWAYS);
1661                 if (ret >= 0) {
1662                     (*num_fixed)++;
1663                     continue;
1664                 }
1665             }
1666 
1667             /* And if we couldn't, print an error */
1668             if (refcount1 < refcount2) {
1669                 res->corruptions++;
1670             } else {
1671                 res->leaks++;
1672             }
1673         }
1674     }
1675 
1676     /* check OFLAG_COPIED */
1677     ret = check_oflag_copied(bs, res, fix);
1678     if (ret < 0) {
1679         goto fail;
1680     }
1681 
1682     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
1683     ret = 0;
1684 
1685 fail:
1686     g_free(refcount_table);
1687 
1688     return ret;
1689 }
1690 
/*
 * Shorthand for ranges_overlap() whose first range is taken from the
 * variables `offset` and `size` in scope at the point of expansion.
 */
#define overlaps_with(ofs, sz) ranges_overlap(offset, size, (ofs), (sz))
1693 
1694 /*
1695  * Checks if the given offset into the image file is actually free to use by
1696  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
1697  * i.e. a sanity check without relying on the refcount tables.
1698  *
1699  * The ign parameter specifies what checks not to perform (being a bitmask of
1700  * QCow2MetadataOverlap values), i.e., what sections to ignore.
1701  *
1702  * Returns:
1703  * - 0 if writing to this offset will not affect the mentioned metadata
1704  * - a positive QCow2MetadataOverlap value indicating one overlapping section
1705  * - a negative value (-errno) indicating an error while performing a check,
1706  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
1707  */
1708 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
1709                                  int64_t size)
1710 {
1711     BDRVQcowState *s = bs->opaque;
1712     int chk = s->overlap_check & ~ign;
1713     int i, j;
1714 
1715     if (!size) {
1716         return 0;
1717     }
1718 
1719     if (chk & QCOW2_OL_MAIN_HEADER) {
1720         if (offset < s->cluster_size) {
1721             return QCOW2_OL_MAIN_HEADER;
1722         }
1723     }
1724 
1725     /* align range to test to cluster boundaries */
1726     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
1727     offset = start_of_cluster(s, offset);
1728 
1729     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
1730         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
1731             return QCOW2_OL_ACTIVE_L1;
1732         }
1733     }
1734 
1735     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
1736         if (overlaps_with(s->refcount_table_offset,
1737             s->refcount_table_size * sizeof(uint64_t))) {
1738             return QCOW2_OL_REFCOUNT_TABLE;
1739         }
1740     }
1741 
1742     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
1743         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
1744             return QCOW2_OL_SNAPSHOT_TABLE;
1745         }
1746     }
1747 
1748     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
1749         for (i = 0; i < s->nb_snapshots; i++) {
1750             if (s->snapshots[i].l1_size &&
1751                 overlaps_with(s->snapshots[i].l1_table_offset,
1752                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
1753                 return QCOW2_OL_INACTIVE_L1;
1754             }
1755         }
1756     }
1757 
1758     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
1759         for (i = 0; i < s->l1_size; i++) {
1760             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
1761                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
1762                 s->cluster_size)) {
1763                 return QCOW2_OL_ACTIVE_L2;
1764             }
1765         }
1766     }
1767 
1768     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
1769         for (i = 0; i < s->refcount_table_size; i++) {
1770             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
1771                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
1772                 s->cluster_size)) {
1773                 return QCOW2_OL_REFCOUNT_BLOCK;
1774             }
1775         }
1776     }
1777 
1778     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
1779         for (i = 0; i < s->nb_snapshots; i++) {
1780             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
1781             uint32_t l1_sz  = s->snapshots[i].l1_size;
1782             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
1783             uint64_t *l1 = g_try_malloc(l1_sz2);
1784             int ret;
1785 
1786             if (l1_sz2 && l1 == NULL) {
1787                 return -ENOMEM;
1788             }
1789 
1790             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
1791             if (ret < 0) {
1792                 g_free(l1);
1793                 return ret;
1794             }
1795 
1796             for (j = 0; j < l1_sz; j++) {
1797                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
1798                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
1799                     g_free(l1);
1800                     return QCOW2_OL_INACTIVE_L2;
1801                 }
1802             }
1803 
1804             g_free(l1);
1805         }
1806     }
1807 
1808     return 0;
1809 }
1810 
1811 static const char *metadata_ol_names[] = {
1812     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
1813     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
1814     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
1815     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
1816     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
1817     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
1818     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
1819     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
1820 };
1821 
1822 /*
1823  * First performs a check for metadata overlaps (through
1824  * qcow2_check_metadata_overlap); if that fails with a negative value (error
1825  * while performing a check), that value is returned. If an impending overlap
1826  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
1827  * and -EIO returned.
1828  *
1829  * Returns 0 if there were neither overlaps nor errors while checking for
1830  * overlaps; or a negative value (-errno) on error.
1831  */
1832 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
1833                                   int64_t size)
1834 {
1835     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
1836 
1837     if (ret < 0) {
1838         return ret;
1839     } else if (ret > 0) {
1840         int metadata_ol_bitnr = ffs(ret) - 1;
1841         char *message;
1842 
1843         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
1844 
1845         fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
1846                 "with %s); image marked as corrupt.\n",
1847                 metadata_ol_names[metadata_ol_bitnr]);
1848         message = g_strdup_printf("Prevented %s overwrite",
1849                 metadata_ol_names[metadata_ol_bitnr]);
1850         qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
1851                                               message,
1852                                               true,
1853                                               offset,
1854                                               true,
1855                                               size,
1856                                               true,
1857                                               &error_abort);
1858         g_free(message);
1859 
1860         qcow2_mark_corrupt(bs);
1861         bs->drv = NULL; /* make BDS unusable */
1862         return -EIO;
1863     }
1864 
1865     return 0;
1866 }
1867