xref: /openbmc/qemu/block/qcow2-refcount.c (revision f1f7e4bf)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu-common.h"
26 #include "block/block_int.h"
27 #include "block/qcow2.h"
28 #include "qemu/range.h"
29 
30 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
31 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
32                             int64_t offset, int64_t length, uint64_t addend,
33                             bool decrease, enum qcow2_discard_type type);
34 
35 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
36 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
37 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
38 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
39 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
40 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
41 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
42 
43 static void set_refcount_ro0(void *refcount_array, uint64_t index,
44                              uint64_t value);
45 static void set_refcount_ro1(void *refcount_array, uint64_t index,
46                              uint64_t value);
47 static void set_refcount_ro2(void *refcount_array, uint64_t index,
48                              uint64_t value);
49 static void set_refcount_ro3(void *refcount_array, uint64_t index,
50                              uint64_t value);
51 static void set_refcount_ro4(void *refcount_array, uint64_t index,
52                              uint64_t value);
53 static void set_refcount_ro5(void *refcount_array, uint64_t index,
54                              uint64_t value);
55 static void set_refcount_ro6(void *refcount_array, uint64_t index,
56                              uint64_t value);
57 
58 
59 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
60     &get_refcount_ro0,
61     &get_refcount_ro1,
62     &get_refcount_ro2,
63     &get_refcount_ro3,
64     &get_refcount_ro4,
65     &get_refcount_ro5,
66     &get_refcount_ro6
67 };
68 
69 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
70     &set_refcount_ro0,
71     &set_refcount_ro1,
72     &set_refcount_ro2,
73     &set_refcount_ro3,
74     &set_refcount_ro4,
75     &set_refcount_ro5,
76     &set_refcount_ro6
77 };
78 
79 
80 /*********************************************************/
81 /* refcount handling */
82 
83 int qcow2_refcount_init(BlockDriverState *bs)
84 {
85     BDRVQcow2State *s = bs->opaque;
86     unsigned int refcount_table_size2, i;
87     int ret;
88 
89     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
90 
91     s->get_refcount = get_refcount_funcs[s->refcount_order];
92     s->set_refcount = set_refcount_funcs[s->refcount_order];
93 
94     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
95     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
96     s->refcount_table = g_try_malloc(refcount_table_size2);
97 
98     if (s->refcount_table_size > 0) {
99         if (s->refcount_table == NULL) {
100             ret = -ENOMEM;
101             goto fail;
102         }
103         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
104         ret = bdrv_pread(bs->file->bs, s->refcount_table_offset,
105                          s->refcount_table, refcount_table_size2);
106         if (ret < 0) {
107             goto fail;
108         }
109         for(i = 0; i < s->refcount_table_size; i++)
110             be64_to_cpus(&s->refcount_table[i]);
111     }
112     return 0;
113  fail:
114     return ret;
115 }
116 
117 void qcow2_refcount_close(BlockDriverState *bs)
118 {
119     BDRVQcow2State *s = bs->opaque;
120     g_free(s->refcount_table);
121 }
122 
123 
124 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
125 {
126     return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
127 }
128 
129 static void set_refcount_ro0(void *refcount_array, uint64_t index,
130                              uint64_t value)
131 {
132     assert(!(value >> 1));
133     ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
134     ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
135 }
136 
137 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
138 {
139     return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
140            & 0x3;
141 }
142 
143 static void set_refcount_ro1(void *refcount_array, uint64_t index,
144                              uint64_t value)
145 {
146     assert(!(value >> 2));
147     ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
148     ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
149 }
150 
151 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
152 {
153     return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
154            & 0xf;
155 }
156 
157 static void set_refcount_ro2(void *refcount_array, uint64_t index,
158                              uint64_t value)
159 {
160     assert(!(value >> 4));
161     ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
162     ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
163 }
164 
165 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
166 {
167     return ((const uint8_t *)refcount_array)[index];
168 }
169 
170 static void set_refcount_ro3(void *refcount_array, uint64_t index,
171                              uint64_t value)
172 {
173     assert(!(value >> 8));
174     ((uint8_t *)refcount_array)[index] = value;
175 }
176 
177 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
178 {
179     return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
180 }
181 
182 static void set_refcount_ro4(void *refcount_array, uint64_t index,
183                              uint64_t value)
184 {
185     assert(!(value >> 16));
186     ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
187 }
188 
189 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
190 {
191     return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
192 }
193 
194 static void set_refcount_ro5(void *refcount_array, uint64_t index,
195                              uint64_t value)
196 {
197     assert(!(value >> 32));
198     ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
199 }
200 
201 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
202 {
203     return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
204 }
205 
206 static void set_refcount_ro6(void *refcount_array, uint64_t index,
207                              uint64_t value)
208 {
209     ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
210 }
211 
212 
213 static int load_refcount_block(BlockDriverState *bs,
214                                int64_t refcount_block_offset,
215                                void **refcount_block)
216 {
217     BDRVQcow2State *s = bs->opaque;
218     int ret;
219 
220     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
221     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
222         refcount_block);
223 
224     return ret;
225 }
226 
227 /*
228  * Retrieves the refcount of the cluster given by its index and stores it in
229  * *refcount. Returns 0 on success and -errno on failure.
230  */
231 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
232                        uint64_t *refcount)
233 {
234     BDRVQcow2State *s = bs->opaque;
235     uint64_t refcount_table_index, block_index;
236     int64_t refcount_block_offset;
237     int ret;
238     void *refcount_block;
239 
240     refcount_table_index = cluster_index >> s->refcount_block_bits;
241     if (refcount_table_index >= s->refcount_table_size) {
242         *refcount = 0;
243         return 0;
244     }
245     refcount_block_offset =
246         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
247     if (!refcount_block_offset) {
248         *refcount = 0;
249         return 0;
250     }
251 
252     if (offset_into_cluster(s, refcount_block_offset)) {
253         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
254                                 " unaligned (reftable index: %#" PRIx64 ")",
255                                 refcount_block_offset, refcount_table_index);
256         return -EIO;
257     }
258 
259     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
260                           &refcount_block);
261     if (ret < 0) {
262         return ret;
263     }
264 
265     block_index = cluster_index & (s->refcount_block_size - 1);
266     *refcount = s->get_refcount(refcount_block, block_index);
267 
268     qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
269 
270     return 0;
271 }
272 
273 /*
274  * Rounds the refcount table size up to avoid growing the table for each single
275  * refcount block that is allocated.
276  */
277 static unsigned int next_refcount_table_size(BDRVQcow2State *s,
278     unsigned int min_size)
279 {
280     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
281     unsigned int refcount_table_clusters =
282         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
283 
284     while (min_clusters > refcount_table_clusters) {
285         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
286     }
287 
288     return refcount_table_clusters << (s->cluster_bits - 3);
289 }
290 
291 
292 /* Checks if two offsets are described by the same refcount block */
293 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
294     uint64_t offset_b)
295 {
296     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
297     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
298 
299     return (block_a == block_b);
300 }
301 
302 /*
303  * Loads a refcount block. If it doesn't exist yet, it is allocated first
304  * (including growing the refcount table if needed).
305  *
306  * Returns 0 on success or -errno in error case
307  */
308 static int alloc_refcount_block(BlockDriverState *bs,
309                                 int64_t cluster_index, void **refcount_block)
310 {
311     BDRVQcow2State *s = bs->opaque;
312     unsigned int refcount_table_index;
313     int ret;
314 
315     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
316 
317     /* Find the refcount block for the given cluster */
318     refcount_table_index = cluster_index >> s->refcount_block_bits;
319 
320     if (refcount_table_index < s->refcount_table_size) {
321 
322         uint64_t refcount_block_offset =
323             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
324 
325         /* If it's already there, we're done */
326         if (refcount_block_offset) {
327             if (offset_into_cluster(s, refcount_block_offset)) {
328                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
329                                         PRIx64 " unaligned (reftable index: "
330                                         "%#x)", refcount_block_offset,
331                                         refcount_table_index);
332                 return -EIO;
333             }
334 
335              return load_refcount_block(bs, refcount_block_offset,
336                                         refcount_block);
337         }
338     }
339 
340     /*
341      * If we came here, we need to allocate something. Something is at least
342      * a cluster for the new refcount block. It may also include a new refcount
343      * table if the old refcount table is too small.
344      *
345      * Note that allocating clusters here needs some special care:
346      *
347      * - We can't use the normal qcow2_alloc_clusters(), it would try to
348      *   increase the refcount and very likely we would end up with an endless
349      *   recursion. Instead we must place the refcount blocks in a way that
350      *   they can describe them themselves.
351      *
352      * - We need to consider that at this point we are inside update_refcounts
353      *   and potentially doing an initial refcount increase. This means that
354      *   some clusters have already been allocated by the caller, but their
355      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
356      *   need to return -EAGAIN to signal the caller that it needs to restart
357      *   the search for free clusters.
358      *
359      * - alloc_clusters_noref and qcow2_free_clusters may load a different
360      *   refcount block into the cache
361      */
362 
363     *refcount_block = NULL;
364 
365     /* We write to the refcount table, so we might depend on L2 tables */
366     ret = qcow2_cache_flush(bs, s->l2_table_cache);
367     if (ret < 0) {
368         return ret;
369     }
370 
371     /* Allocate the refcount block itself and mark it as used */
372     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
373     if (new_block < 0) {
374         return new_block;
375     }
376 
377 #ifdef DEBUG_ALLOC2
378     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
379         " at %" PRIx64 "\n",
380         refcount_table_index, cluster_index << s->cluster_bits, new_block);
381 #endif
382 
383     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
384         /* Zero the new refcount block before updating it */
385         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
386                                     refcount_block);
387         if (ret < 0) {
388             goto fail_block;
389         }
390 
391         memset(*refcount_block, 0, s->cluster_size);
392 
393         /* The block describes itself, need to update the cache */
394         int block_index = (new_block >> s->cluster_bits) &
395             (s->refcount_block_size - 1);
396         s->set_refcount(*refcount_block, block_index, 1);
397     } else {
398         /* Described somewhere else. This can recurse at most twice before we
399          * arrive at a block that describes itself. */
400         ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
401                               QCOW2_DISCARD_NEVER);
402         if (ret < 0) {
403             goto fail_block;
404         }
405 
406         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
407         if (ret < 0) {
408             goto fail_block;
409         }
410 
411         /* Initialize the new refcount block only after updating its refcount,
412          * update_refcount uses the refcount cache itself */
413         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
414                                     refcount_block);
415         if (ret < 0) {
416             goto fail_block;
417         }
418 
419         memset(*refcount_block, 0, s->cluster_size);
420     }
421 
422     /* Now the new refcount block needs to be written to disk */
423     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
424     qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
425     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
426     if (ret < 0) {
427         goto fail_block;
428     }
429 
430     /* If the refcount table is big enough, just hook the block up there */
431     if (refcount_table_index < s->refcount_table_size) {
432         uint64_t data64 = cpu_to_be64(new_block);
433         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
434         ret = bdrv_pwrite_sync(bs->file->bs,
435             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
436             &data64, sizeof(data64));
437         if (ret < 0) {
438             goto fail_block;
439         }
440 
441         s->refcount_table[refcount_table_index] = new_block;
442 
443         /* The new refcount block may be where the caller intended to put its
444          * data, so let it restart the search. */
445         return -EAGAIN;
446     }
447 
448     qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
449 
450     /*
451      * If we come here, we need to grow the refcount table. Again, a new
452      * refcount table needs some space and we can't simply allocate to avoid
453      * endless recursion.
454      *
455      * Therefore let's grab new refcount blocks at the end of the image, which
456      * will describe themselves and the new refcount table. This way we can
457      * reference them only in the new table and do the switch to the new
458      * refcount table at once without producing an inconsistent state in
459      * between.
460      */
461     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
462 
463     /* Calculate the number of refcount blocks needed so far; this will be the
464      * basis for calculating the index of the first cluster used for the
465      * self-describing refcount structures which we are about to create.
466      *
467      * Because we reached this point, there cannot be any refcount entries for
468      * cluster_index or higher indices yet. However, because new_block has been
469      * allocated to describe that cluster (and it will assume this role later
470      * on), we cannot use that index; also, new_block may actually have a higher
471      * cluster index than cluster_index, so it needs to be taken into account
472      * here (and 1 needs to be added to its value because that cluster is used).
473      */
474     uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
475                                             (new_block >> s->cluster_bits) + 1),
476                                         s->refcount_block_size);
477 
478     if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
479         return -EFBIG;
480     }
481 
482     /* And now we need at least one block more for the new metadata */
483     uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
484     uint64_t last_table_size;
485     uint64_t blocks_clusters;
486     do {
487         uint64_t table_clusters =
488             size_to_clusters(s, table_size * sizeof(uint64_t));
489         blocks_clusters = 1 +
490             ((table_clusters + s->refcount_block_size - 1)
491             / s->refcount_block_size);
492         uint64_t meta_clusters = table_clusters + blocks_clusters;
493 
494         last_table_size = table_size;
495         table_size = next_refcount_table_size(s, blocks_used +
496             ((meta_clusters + s->refcount_block_size - 1)
497             / s->refcount_block_size));
498 
499     } while (last_table_size != table_size);
500 
501 #ifdef DEBUG_ALLOC2
502     fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
503         s->refcount_table_size, table_size);
504 #endif
505 
506     /* Create the new refcount table and blocks */
507     uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
508         s->cluster_size;
509     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
510     uint64_t *new_table = g_try_new0(uint64_t, table_size);
511     void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
512 
513     assert(table_size > 0 && blocks_clusters > 0);
514     if (new_table == NULL || new_blocks == NULL) {
515         ret = -ENOMEM;
516         goto fail_table;
517     }
518 
519     /* Fill the new refcount table */
520     memcpy(new_table, s->refcount_table,
521         s->refcount_table_size * sizeof(uint64_t));
522     new_table[refcount_table_index] = new_block;
523 
524     int i;
525     for (i = 0; i < blocks_clusters; i++) {
526         new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
527     }
528 
529     /* Fill the refcount blocks */
530     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
531     int block = 0;
532     for (i = 0; i < table_clusters + blocks_clusters; i++) {
533         s->set_refcount(new_blocks, block++, 1);
534     }
535 
536     /* Write refcount blocks to disk */
537     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
538     ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks,
539         blocks_clusters * s->cluster_size);
540     g_free(new_blocks);
541     new_blocks = NULL;
542     if (ret < 0) {
543         goto fail_table;
544     }
545 
546     /* Write refcount table to disk */
547     for(i = 0; i < table_size; i++) {
548         cpu_to_be64s(&new_table[i]);
549     }
550 
551     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
552     ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table,
553         table_size * sizeof(uint64_t));
554     if (ret < 0) {
555         goto fail_table;
556     }
557 
558     for(i = 0; i < table_size; i++) {
559         be64_to_cpus(&new_table[i]);
560     }
561 
562     /* Hook up the new refcount table in the qcow2 header */
563     struct QEMU_PACKED {
564         uint64_t d64;
565         uint32_t d32;
566     } data;
567     cpu_to_be64w(&data.d64, table_offset);
568     cpu_to_be32w(&data.d32, table_clusters);
569     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
570     ret = bdrv_pwrite_sync(bs->file->bs,
571                            offsetof(QCowHeader, refcount_table_offset),
572                            &data, sizeof(data));
573     if (ret < 0) {
574         goto fail_table;
575     }
576 
577     /* And switch it in memory */
578     uint64_t old_table_offset = s->refcount_table_offset;
579     uint64_t old_table_size = s->refcount_table_size;
580 
581     g_free(s->refcount_table);
582     s->refcount_table = new_table;
583     s->refcount_table_size = table_size;
584     s->refcount_table_offset = table_offset;
585 
586     /* Free old table. */
587     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
588                         QCOW2_DISCARD_OTHER);
589 
590     ret = load_refcount_block(bs, new_block, refcount_block);
591     if (ret < 0) {
592         return ret;
593     }
594 
595     /* If we were trying to do the initial refcount update for some cluster
596      * allocation, we might have used the same clusters to store newly
597      * allocated metadata. Make the caller search some new space. */
598     return -EAGAIN;
599 
600 fail_table:
601     g_free(new_blocks);
602     g_free(new_table);
603 fail_block:
604     if (*refcount_block != NULL) {
605         qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
606     }
607     return ret;
608 }
609 
610 void qcow2_process_discards(BlockDriverState *bs, int ret)
611 {
612     BDRVQcow2State *s = bs->opaque;
613     Qcow2DiscardRegion *d, *next;
614 
615     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
616         QTAILQ_REMOVE(&s->discards, d, next);
617 
618         /* Discard is optional, ignore the return value */
619         if (ret >= 0) {
620             bdrv_discard(bs->file->bs,
621                          d->offset >> BDRV_SECTOR_BITS,
622                          d->bytes >> BDRV_SECTOR_BITS);
623         }
624 
625         g_free(d);
626     }
627 }
628 
629 static void update_refcount_discard(BlockDriverState *bs,
630                                     uint64_t offset, uint64_t length)
631 {
632     BDRVQcow2State *s = bs->opaque;
633     Qcow2DiscardRegion *d, *p, *next;
634 
635     QTAILQ_FOREACH(d, &s->discards, next) {
636         uint64_t new_start = MIN(offset, d->offset);
637         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
638 
639         if (new_end - new_start <= length + d->bytes) {
640             /* There can't be any overlap, areas ending up here have no
641              * references any more and therefore shouldn't get freed another
642              * time. */
643             assert(d->bytes + length == new_end - new_start);
644             d->offset = new_start;
645             d->bytes = new_end - new_start;
646             goto found;
647         }
648     }
649 
650     d = g_malloc(sizeof(*d));
651     *d = (Qcow2DiscardRegion) {
652         .bs     = bs,
653         .offset = offset,
654         .bytes  = length,
655     };
656     QTAILQ_INSERT_TAIL(&s->discards, d, next);
657 
658 found:
659     /* Merge discard requests if they are adjacent now */
660     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
661         if (p == d
662             || p->offset > d->offset + d->bytes
663             || d->offset > p->offset + p->bytes)
664         {
665             continue;
666         }
667 
668         /* Still no overlap possible */
669         assert(p->offset == d->offset + d->bytes
670             || d->offset == p->offset + p->bytes);
671 
672         QTAILQ_REMOVE(&s->discards, p, next);
673         d->offset = MIN(d->offset, p->offset);
674         d->bytes += p->bytes;
675         g_free(p);
676     }
677 }
678 
679 /* XXX: cache several refcount block clusters ? */
680 /* @addend is the absolute value of the addend; if @decrease is set, @addend
681  * will be subtracted from the current refcount, otherwise it will be added */
682 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
683                                                    int64_t offset,
684                                                    int64_t length,
685                                                    uint64_t addend,
686                                                    bool decrease,
687                                                    enum qcow2_discard_type type)
688 {
689     BDRVQcow2State *s = bs->opaque;
690     int64_t start, last, cluster_offset;
691     void *refcount_block = NULL;
692     int64_t old_table_index = -1;
693     int ret;
694 
695 #ifdef DEBUG_ALLOC2
696     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
697             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
698             addend);
699 #endif
700     if (length < 0) {
701         return -EINVAL;
702     } else if (length == 0) {
703         return 0;
704     }
705 
706     if (decrease) {
707         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
708             s->l2_table_cache);
709     }
710 
711     start = start_of_cluster(s, offset);
712     last = start_of_cluster(s, offset + length - 1);
713     for(cluster_offset = start; cluster_offset <= last;
714         cluster_offset += s->cluster_size)
715     {
716         int block_index;
717         uint64_t refcount;
718         int64_t cluster_index = cluster_offset >> s->cluster_bits;
719         int64_t table_index = cluster_index >> s->refcount_block_bits;
720 
721         /* Load the refcount block and allocate it if needed */
722         if (table_index != old_table_index) {
723             if (refcount_block) {
724                 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
725             }
726             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
727             if (ret < 0) {
728                 goto fail;
729             }
730         }
731         old_table_index = table_index;
732 
733         qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
734                                      refcount_block);
735 
736         /* we can update the count and save it */
737         block_index = cluster_index & (s->refcount_block_size - 1);
738 
739         refcount = s->get_refcount(refcount_block, block_index);
740         if (decrease ? (refcount - addend > refcount)
741                      : (refcount + addend < refcount ||
742                         refcount + addend > s->refcount_max))
743         {
744             ret = -EINVAL;
745             goto fail;
746         }
747         if (decrease) {
748             refcount -= addend;
749         } else {
750             refcount += addend;
751         }
752         if (refcount == 0 && cluster_index < s->free_cluster_index) {
753             s->free_cluster_index = cluster_index;
754         }
755         s->set_refcount(refcount_block, block_index, refcount);
756 
757         if (refcount == 0 && s->discard_passthrough[type]) {
758             update_refcount_discard(bs, cluster_offset, s->cluster_size);
759         }
760     }
761 
762     ret = 0;
763 fail:
764     if (!s->cache_discards) {
765         qcow2_process_discards(bs, ret);
766     }
767 
768     /* Write last changed block to disk */
769     if (refcount_block) {
770         qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
771     }
772 
773     /*
774      * Try do undo any updates if an error is returned (This may succeed in
775      * some cases like ENOSPC for allocating a new refcount block)
776      */
777     if (ret < 0) {
778         int dummy;
779         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
780                                 !decrease, QCOW2_DISCARD_NEVER);
781         (void)dummy;
782     }
783 
784     return ret;
785 }
786 
787 /*
788  * Increases or decreases the refcount of a given cluster.
789  *
790  * @addend is the absolute value of the addend; if @decrease is set, @addend
791  * will be subtracted from the current refcount, otherwise it will be added.
792  *
793  * On success 0 is returned; on failure -errno is returned.
794  */
795 int qcow2_update_cluster_refcount(BlockDriverState *bs,
796                                   int64_t cluster_index,
797                                   uint64_t addend, bool decrease,
798                                   enum qcow2_discard_type type)
799 {
800     BDRVQcow2State *s = bs->opaque;
801     int ret;
802 
803     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
804                           decrease, type);
805     if (ret < 0) {
806         return ret;
807     }
808 
809     return 0;
810 }
811 
812 
813 
814 /*********************************************************/
815 /* cluster allocation functions */
816 
817 
818 
819 /* return < 0 if error */
820 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
821 {
822     BDRVQcow2State *s = bs->opaque;
823     uint64_t i, nb_clusters, refcount;
824     int ret;
825 
826     /* We can't allocate clusters if they may still be queued for discard. */
827     if (s->cache_discards) {
828         qcow2_process_discards(bs, 0);
829     }
830 
831     nb_clusters = size_to_clusters(s, size);
832 retry:
833     for(i = 0; i < nb_clusters; i++) {
834         uint64_t next_cluster_index = s->free_cluster_index++;
835         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
836 
837         if (ret < 0) {
838             return ret;
839         } else if (refcount != 0) {
840             goto retry;
841         }
842     }
843 
844     /* Make sure that all offsets in the "allocated" range are representable
845      * in an int64_t */
846     if (s->free_cluster_index > 0 &&
847         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
848     {
849         return -EFBIG;
850     }
851 
852 #ifdef DEBUG_ALLOC2
853     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
854             size,
855             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
856 #endif
857     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
858 }
859 
860 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
861 {
862     int64_t offset;
863     int ret;
864 
865     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
866     do {
867         offset = alloc_clusters_noref(bs, size);
868         if (offset < 0) {
869             return offset;
870         }
871 
872         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
873     } while (ret == -EAGAIN);
874 
875     if (ret < 0) {
876         return ret;
877     }
878 
879     return offset;
880 }
881 
882 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
883                                 int64_t nb_clusters)
884 {
885     BDRVQcow2State *s = bs->opaque;
886     uint64_t cluster_index, refcount;
887     uint64_t i;
888     int ret;
889 
890     assert(nb_clusters >= 0);
891     if (nb_clusters == 0) {
892         return 0;
893     }
894 
895     do {
896         /* Check how many clusters there are free */
897         cluster_index = offset >> s->cluster_bits;
898         for(i = 0; i < nb_clusters; i++) {
899             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
900             if (ret < 0) {
901                 return ret;
902             } else if (refcount != 0) {
903                 break;
904             }
905         }
906 
907         /* And then allocate them */
908         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
909                               QCOW2_DISCARD_NEVER);
910     } while (ret == -EAGAIN);
911 
912     if (ret < 0) {
913         return ret;
914     }
915 
916     return i;
917 }
918 
919 /* only used to allocate compressed sectors. We try to allocate
920    contiguous sectors. size must be <= cluster_size */
921 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
922 {
923     BDRVQcow2State *s = bs->opaque;
924     int64_t offset;
925     size_t free_in_cluster;
926     int ret;
927 
928     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
929     assert(size > 0 && size <= s->cluster_size);
930     assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
931 
932     offset = s->free_byte_offset;
933 
934     if (offset) {
935         uint64_t refcount;
936         ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
937         if (ret < 0) {
938             return ret;
939         }
940 
941         if (refcount == s->refcount_max) {
942             offset = 0;
943         }
944     }
945 
946     free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
947     do {
948         if (!offset || free_in_cluster < size) {
949             int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
950             if (new_cluster < 0) {
951                 return new_cluster;
952             }
953 
954             if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
955                 offset = new_cluster;
956                 free_in_cluster = s->cluster_size;
957             } else {
958                 free_in_cluster += s->cluster_size;
959             }
960         }
961 
962         assert(offset);
963         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
964         if (ret < 0) {
965             offset = 0;
966         }
967     } while (ret == -EAGAIN);
968     if (ret < 0) {
969         return ret;
970     }
971 
972     /* The cluster refcount was incremented; refcount blocks must be flushed
973      * before the caller's L2 table updates. */
974     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
975 
976     s->free_byte_offset = offset + size;
977     if (!offset_into_cluster(s, s->free_byte_offset)) {
978         s->free_byte_offset = 0;
979     }
980 
981     return offset;
982 }
983 
984 void qcow2_free_clusters(BlockDriverState *bs,
985                           int64_t offset, int64_t size,
986                           enum qcow2_discard_type type)
987 {
988     int ret;
989 
990     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
991     ret = update_refcount(bs, offset, size, 1, true, type);
992     if (ret < 0) {
993         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
994         /* TODO Remember the clusters to free them later and avoid leaking */
995     }
996 }
997 
998 /*
999  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1000  * normal cluster, compressed cluster, etc.)
1001  */
1002 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1003                              int nb_clusters, enum qcow2_discard_type type)
1004 {
1005     BDRVQcow2State *s = bs->opaque;
1006 
1007     switch (qcow2_get_cluster_type(l2_entry)) {
1008     case QCOW2_CLUSTER_COMPRESSED:
1009         {
1010             int nb_csectors;
1011             nb_csectors = ((l2_entry >> s->csize_shift) &
1012                            s->csize_mask) + 1;
1013             qcow2_free_clusters(bs,
1014                 (l2_entry & s->cluster_offset_mask) & ~511,
1015                 nb_csectors * 512, type);
1016         }
1017         break;
1018     case QCOW2_CLUSTER_NORMAL:
1019     case QCOW2_CLUSTER_ZERO:
1020         if (l2_entry & L2E_OFFSET_MASK) {
1021             if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1022                 qcow2_signal_corruption(bs, false, -1, -1,
1023                                         "Cannot free unaligned cluster %#llx",
1024                                         l2_entry & L2E_OFFSET_MASK);
1025             } else {
1026                 qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1027                                     nb_clusters << s->cluster_bits, type);
1028             }
1029         }
1030         break;
1031     case QCOW2_CLUSTER_UNALLOCATED:
1032         break;
1033     default:
1034         abort();
1035     }
1036 }
1037 
1038 
1039 
1040 /*********************************************************/
1041 /* snapshots and image creation */
1042 
1043 
1044 
1045 /* update the refcounts of snapshots and the copied flag */
1046 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1047     int64_t l1_table_offset, int l1_size, int addend)
1048 {
1049     BDRVQcow2State *s = bs->opaque;
1050     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
1051     bool l1_allocated = false;
1052     int64_t old_offset, old_l2_offset;
1053     int i, j, l1_modified = 0, nb_csectors;
1054     int ret;
1055 
1056     assert(addend >= -1 && addend <= 1);
1057 
1058     l2_table = NULL;
1059     l1_table = NULL;
1060     l1_size2 = l1_size * sizeof(uint64_t);
1061 
1062     s->cache_discards = true;
1063 
1064     /* WARNING: qcow2_snapshot_goto relies on this function not using the
1065      * l1_table_offset when it is the current s->l1_table_offset! Be careful
1066      * when changing this! */
1067     if (l1_table_offset != s->l1_table_offset) {
1068         l1_table = g_try_malloc0(align_offset(l1_size2, 512));
1069         if (l1_size2 && l1_table == NULL) {
1070             ret = -ENOMEM;
1071             goto fail;
1072         }
1073         l1_allocated = true;
1074 
1075         ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
1076         if (ret < 0) {
1077             goto fail;
1078         }
1079 
1080         for(i = 0;i < l1_size; i++)
1081             be64_to_cpus(&l1_table[i]);
1082     } else {
1083         assert(l1_size == s->l1_size);
1084         l1_table = s->l1_table;
1085         l1_allocated = false;
1086     }
1087 
1088     for(i = 0; i < l1_size; i++) {
1089         l2_offset = l1_table[i];
1090         if (l2_offset) {
1091             old_l2_offset = l2_offset;
1092             l2_offset &= L1E_OFFSET_MASK;
1093 
1094             if (offset_into_cluster(s, l2_offset)) {
1095                 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1096                                         PRIx64 " unaligned (L1 index: %#x)",
1097                                         l2_offset, i);
1098                 ret = -EIO;
1099                 goto fail;
1100             }
1101 
1102             ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
1103                 (void**) &l2_table);
1104             if (ret < 0) {
1105                 goto fail;
1106             }
1107 
1108             for(j = 0; j < s->l2_size; j++) {
1109                 uint64_t cluster_index;
1110 
1111                 offset = be64_to_cpu(l2_table[j]);
1112                 old_offset = offset;
1113                 offset &= ~QCOW_OFLAG_COPIED;
1114 
1115                 switch (qcow2_get_cluster_type(offset)) {
1116                     case QCOW2_CLUSTER_COMPRESSED:
1117                         nb_csectors = ((offset >> s->csize_shift) &
1118                                        s->csize_mask) + 1;
1119                         if (addend != 0) {
1120                             ret = update_refcount(bs,
1121                                 (offset & s->cluster_offset_mask) & ~511,
1122                                 nb_csectors * 512, abs(addend), addend < 0,
1123                                 QCOW2_DISCARD_SNAPSHOT);
1124                             if (ret < 0) {
1125                                 goto fail;
1126                             }
1127                         }
1128                         /* compressed clusters are never modified */
1129                         refcount = 2;
1130                         break;
1131 
1132                     case QCOW2_CLUSTER_NORMAL:
1133                     case QCOW2_CLUSTER_ZERO:
1134                         if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
1135                             qcow2_signal_corruption(bs, true, -1, -1, "Data "
1136                                                     "cluster offset %#llx "
1137                                                     "unaligned (L2 offset: %#"
1138                                                     PRIx64 ", L2 index: %#x)",
1139                                                     offset & L2E_OFFSET_MASK,
1140                                                     l2_offset, j);
1141                             ret = -EIO;
1142                             goto fail;
1143                         }
1144 
1145                         cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
1146                         if (!cluster_index) {
1147                             /* unallocated */
1148                             refcount = 0;
1149                             break;
1150                         }
1151                         if (addend != 0) {
1152                             ret = qcow2_update_cluster_refcount(bs,
1153                                     cluster_index, abs(addend), addend < 0,
1154                                     QCOW2_DISCARD_SNAPSHOT);
1155                             if (ret < 0) {
1156                                 goto fail;
1157                             }
1158                         }
1159 
1160                         ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1161                         if (ret < 0) {
1162                             goto fail;
1163                         }
1164                         break;
1165 
1166                     case QCOW2_CLUSTER_UNALLOCATED:
1167                         refcount = 0;
1168                         break;
1169 
1170                     default:
1171                         abort();
1172                 }
1173 
1174                 if (refcount == 1) {
1175                     offset |= QCOW_OFLAG_COPIED;
1176                 }
1177                 if (offset != old_offset) {
1178                     if (addend > 0) {
1179                         qcow2_cache_set_dependency(bs, s->l2_table_cache,
1180                             s->refcount_block_cache);
1181                     }
1182                     l2_table[j] = cpu_to_be64(offset);
1183                     qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
1184                                                  l2_table);
1185                 }
1186             }
1187 
1188             qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1189 
1190             if (addend != 0) {
1191                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1192                                                         s->cluster_bits,
1193                                                     abs(addend), addend < 0,
1194                                                     QCOW2_DISCARD_SNAPSHOT);
1195                 if (ret < 0) {
1196                     goto fail;
1197                 }
1198             }
1199             ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1200                                      &refcount);
1201             if (ret < 0) {
1202                 goto fail;
1203             } else if (refcount == 1) {
1204                 l2_offset |= QCOW_OFLAG_COPIED;
1205             }
1206             if (l2_offset != old_l2_offset) {
1207                 l1_table[i] = l2_offset;
1208                 l1_modified = 1;
1209             }
1210         }
1211     }
1212 
1213     ret = bdrv_flush(bs);
1214 fail:
1215     if (l2_table) {
1216         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1217     }
1218 
1219     s->cache_discards = false;
1220     qcow2_process_discards(bs, ret);
1221 
1222     /* Update L1 only if it isn't deleted anyway (addend = -1) */
1223     if (ret == 0 && addend >= 0 && l1_modified) {
1224         for (i = 0; i < l1_size; i++) {
1225             cpu_to_be64s(&l1_table[i]);
1226         }
1227 
1228         ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset,
1229                                l1_table, l1_size2);
1230 
1231         for (i = 0; i < l1_size; i++) {
1232             be64_to_cpus(&l1_table[i]);
1233         }
1234     }
1235     if (l1_allocated)
1236         g_free(l1_table);
1237     return ret;
1238 }
1239 
1240 
1241 
1242 
1243 /*********************************************************/
1244 /* refcount checking functions */
1245 
1246 
1247 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1248 {
1249     /* This assertion holds because there is no way we can address more than
1250      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1251      * offsets have to be representable in bytes); due to every cluster
1252      * corresponding to one refcount entry, we are well below that limit */
1253     assert(entries < (UINT64_C(1) << (64 - 9)));
1254 
1255     /* Thanks to the assertion this will not overflow, because
1256      * s->refcount_order < 7.
1257      * (note: x << s->refcount_order == x * s->refcount_bits) */
1258     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1259 }
1260 
1261 /**
1262  * Reallocates *array so that it can hold new_size entries. *size must contain
1263  * the current number of entries in *array. If the reallocation fails, *array
1264  * and *size will not be modified and -errno will be returned. If the
1265  * reallocation is successful, *array will be set to the new buffer, *size
1266  * will be set to new_size and 0 will be returned. The size of the reallocated
1267  * refcount array buffer will be aligned to a cluster boundary, and the newly
1268  * allocated area will be zeroed.
1269  */
1270 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1271                                   int64_t *size, int64_t new_size)
1272 {
1273     int64_t old_byte_size, new_byte_size;
1274     void *new_ptr;
1275 
1276     /* Round to clusters so the array can be directly written to disk */
1277     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1278                     * s->cluster_size;
1279     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1280                     * s->cluster_size;
1281 
1282     if (new_byte_size == old_byte_size) {
1283         *size = new_size;
1284         return 0;
1285     }
1286 
1287     assert(new_byte_size > 0);
1288 
1289     if (new_byte_size > SIZE_MAX) {
1290         return -ENOMEM;
1291     }
1292 
1293     new_ptr = g_try_realloc(*array, new_byte_size);
1294     if (!new_ptr) {
1295         return -ENOMEM;
1296     }
1297 
1298     if (new_byte_size > old_byte_size) {
1299         memset((char *)new_ptr + old_byte_size, 0,
1300                new_byte_size - old_byte_size);
1301     }
1302 
1303     *array = new_ptr;
1304     *size  = new_size;
1305 
1306     return 0;
1307 }
1308 
1309 /*
1310  * Increases the refcount for a range of clusters in a given refcount table.
1311  * This is used to construct a temporary refcount table out of L1 and L2 tables
1312  * which can be compared to the refcount table saved in the image.
1313  *
1314  * Modifies the number of errors in res.
1315  */
1316 static int inc_refcounts(BlockDriverState *bs,
1317                          BdrvCheckResult *res,
1318                          void **refcount_table,
1319                          int64_t *refcount_table_size,
1320                          int64_t offset, int64_t size)
1321 {
1322     BDRVQcow2State *s = bs->opaque;
1323     uint64_t start, last, cluster_offset, k, refcount;
1324     int ret;
1325 
1326     if (size <= 0) {
1327         return 0;
1328     }
1329 
1330     start = start_of_cluster(s, offset);
1331     last = start_of_cluster(s, offset + size - 1);
1332     for(cluster_offset = start; cluster_offset <= last;
1333         cluster_offset += s->cluster_size) {
1334         k = cluster_offset >> s->cluster_bits;
1335         if (k >= *refcount_table_size) {
1336             ret = realloc_refcount_array(s, refcount_table,
1337                                          refcount_table_size, k + 1);
1338             if (ret < 0) {
1339                 res->check_errors++;
1340                 return ret;
1341             }
1342         }
1343 
1344         refcount = s->get_refcount(*refcount_table, k);
1345         if (refcount == s->refcount_max) {
1346             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1347                     "\n", cluster_offset);
1348             res->corruptions++;
1349             continue;
1350         }
1351         s->set_refcount(*refcount_table, k, refcount + 1);
1352     }
1353 
1354     return 0;
1355 }
1356 
1357 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1358 enum {
1359     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1360 };
1361 
1362 /*
1363  * Increases the refcount in the given refcount table for the all clusters
1364  * referenced in the L2 table. While doing so, performs some checks on L2
1365  * entries.
1366  *
1367  * Returns the number of errors found by the checks or -errno if an internal
1368  * error occurred.
1369  */
1370 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1371                               void **refcount_table,
1372                               int64_t *refcount_table_size, int64_t l2_offset,
1373                               int flags)
1374 {
1375     BDRVQcow2State *s = bs->opaque;
1376     uint64_t *l2_table, l2_entry;
1377     uint64_t next_contiguous_offset = 0;
1378     int i, l2_size, nb_csectors, ret;
1379 
1380     /* Read L2 table from disk */
1381     l2_size = s->l2_size * sizeof(uint64_t);
1382     l2_table = g_malloc(l2_size);
1383 
1384     ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size);
1385     if (ret < 0) {
1386         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1387         res->check_errors++;
1388         goto fail;
1389     }
1390 
1391     /* Do the actual checks */
1392     for(i = 0; i < s->l2_size; i++) {
1393         l2_entry = be64_to_cpu(l2_table[i]);
1394 
1395         switch (qcow2_get_cluster_type(l2_entry)) {
1396         case QCOW2_CLUSTER_COMPRESSED:
1397             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1398             if (l2_entry & QCOW_OFLAG_COPIED) {
1399                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1400                     "copied flag must never be set for compressed "
1401                     "clusters\n", l2_entry >> s->cluster_bits);
1402                 l2_entry &= ~QCOW_OFLAG_COPIED;
1403                 res->corruptions++;
1404             }
1405 
1406             /* Mark cluster as used */
1407             nb_csectors = ((l2_entry >> s->csize_shift) &
1408                            s->csize_mask) + 1;
1409             l2_entry &= s->cluster_offset_mask;
1410             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1411                                 l2_entry & ~511, nb_csectors * 512);
1412             if (ret < 0) {
1413                 goto fail;
1414             }
1415 
1416             if (flags & CHECK_FRAG_INFO) {
1417                 res->bfi.allocated_clusters++;
1418                 res->bfi.compressed_clusters++;
1419 
1420                 /* Compressed clusters are fragmented by nature.  Since they
1421                  * take up sub-sector space but we only have sector granularity
1422                  * I/O we need to re-read the same sectors even for adjacent
1423                  * compressed clusters.
1424                  */
1425                 res->bfi.fragmented_clusters++;
1426             }
1427             break;
1428 
1429         case QCOW2_CLUSTER_ZERO:
1430             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1431                 break;
1432             }
1433             /* fall through */
1434 
1435         case QCOW2_CLUSTER_NORMAL:
1436         {
1437             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1438 
1439             if (flags & CHECK_FRAG_INFO) {
1440                 res->bfi.allocated_clusters++;
1441                 if (next_contiguous_offset &&
1442                     offset != next_contiguous_offset) {
1443                     res->bfi.fragmented_clusters++;
1444                 }
1445                 next_contiguous_offset = offset + s->cluster_size;
1446             }
1447 
1448             /* Mark cluster as used */
1449             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1450                                 offset, s->cluster_size);
1451             if (ret < 0) {
1452                 goto fail;
1453             }
1454 
1455             /* Correct offsets are cluster aligned */
1456             if (offset_into_cluster(s, offset)) {
1457                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1458                     "properly aligned; L2 entry corrupted.\n", offset);
1459                 res->corruptions++;
1460             }
1461             break;
1462         }
1463 
1464         case QCOW2_CLUSTER_UNALLOCATED:
1465             break;
1466 
1467         default:
1468             abort();
1469         }
1470     }
1471 
1472     g_free(l2_table);
1473     return 0;
1474 
1475 fail:
1476     g_free(l2_table);
1477     return ret;
1478 }
1479 
1480 /*
1481  * Increases the refcount for the L1 table, its L2 tables and all referenced
1482  * clusters in the given refcount table. While doing so, performs some checks
1483  * on L1 and L2 entries.
1484  *
1485  * Returns the number of errors found by the checks or -errno if an internal
1486  * error occurred.
1487  */
1488 static int check_refcounts_l1(BlockDriverState *bs,
1489                               BdrvCheckResult *res,
1490                               void **refcount_table,
1491                               int64_t *refcount_table_size,
1492                               int64_t l1_table_offset, int l1_size,
1493                               int flags)
1494 {
1495     BDRVQcow2State *s = bs->opaque;
1496     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1497     int i, ret;
1498 
1499     l1_size2 = l1_size * sizeof(uint64_t);
1500 
1501     /* Mark L1 table as used */
1502     ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1503                         l1_table_offset, l1_size2);
1504     if (ret < 0) {
1505         goto fail;
1506     }
1507 
1508     /* Read L1 table entries from disk */
1509     if (l1_size2 > 0) {
1510         l1_table = g_try_malloc(l1_size2);
1511         if (l1_table == NULL) {
1512             ret = -ENOMEM;
1513             res->check_errors++;
1514             goto fail;
1515         }
1516         ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
1517         if (ret < 0) {
1518             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1519             res->check_errors++;
1520             goto fail;
1521         }
1522         for(i = 0;i < l1_size; i++)
1523             be64_to_cpus(&l1_table[i]);
1524     }
1525 
1526     /* Do the actual checks */
1527     for(i = 0; i < l1_size; i++) {
1528         l2_offset = l1_table[i];
1529         if (l2_offset) {
1530             /* Mark L2 table as used */
1531             l2_offset &= L1E_OFFSET_MASK;
1532             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1533                                 l2_offset, s->cluster_size);
1534             if (ret < 0) {
1535                 goto fail;
1536             }
1537 
1538             /* L2 tables are cluster aligned */
1539             if (offset_into_cluster(s, l2_offset)) {
1540                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1541                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1542                 res->corruptions++;
1543             }
1544 
1545             /* Process and check L2 entries */
1546             ret = check_refcounts_l2(bs, res, refcount_table,
1547                                      refcount_table_size, l2_offset, flags);
1548             if (ret < 0) {
1549                 goto fail;
1550             }
1551         }
1552     }
1553     g_free(l1_table);
1554     return 0;
1555 
1556 fail:
1557     g_free(l1_table);
1558     return ret;
1559 }
1560 
1561 /*
1562  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1563  *
1564  * This function does not print an error message nor does it increment
1565  * check_errors if qcow2_get_refcount fails (this is because such an error will
1566  * have been already detected and sufficiently signaled by the calling function
1567  * (qcow2_check_refcounts) by the time this function is called).
1568  */
1569 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1570                               BdrvCheckMode fix)
1571 {
1572     BDRVQcow2State *s = bs->opaque;
1573     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1574     int ret;
1575     uint64_t refcount;
1576     int i, j;
1577 
1578     for (i = 0; i < s->l1_size; i++) {
1579         uint64_t l1_entry = s->l1_table[i];
1580         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1581         bool l2_dirty = false;
1582 
1583         if (!l2_offset) {
1584             continue;
1585         }
1586 
1587         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1588                                  &refcount);
1589         if (ret < 0) {
1590             /* don't print message nor increment check_errors */
1591             continue;
1592         }
1593         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1594             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1595                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1596                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1597                                             "ERROR",
1598                     i, l1_entry, refcount);
1599             if (fix & BDRV_FIX_ERRORS) {
1600                 s->l1_table[i] = refcount == 1
1601                                ? l1_entry |  QCOW_OFLAG_COPIED
1602                                : l1_entry & ~QCOW_OFLAG_COPIED;
1603                 ret = qcow2_write_l1_entry(bs, i);
1604                 if (ret < 0) {
1605                     res->check_errors++;
1606                     goto fail;
1607                 }
1608                 res->corruptions_fixed++;
1609             } else {
1610                 res->corruptions++;
1611             }
1612         }
1613 
1614         ret = bdrv_pread(bs->file->bs, l2_offset, l2_table,
1615                          s->l2_size * sizeof(uint64_t));
1616         if (ret < 0) {
1617             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1618                     strerror(-ret));
1619             res->check_errors++;
1620             goto fail;
1621         }
1622 
1623         for (j = 0; j < s->l2_size; j++) {
1624             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1625             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1626             int cluster_type = qcow2_get_cluster_type(l2_entry);
1627 
1628             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1629                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1630                 ret = qcow2_get_refcount(bs,
1631                                          data_offset >> s->cluster_bits,
1632                                          &refcount);
1633                 if (ret < 0) {
1634                     /* don't print message nor increment check_errors */
1635                     continue;
1636                 }
1637                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1638                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1639                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1640                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1641                                                     "ERROR",
1642                             l2_entry, refcount);
1643                     if (fix & BDRV_FIX_ERRORS) {
1644                         l2_table[j] = cpu_to_be64(refcount == 1
1645                                     ? l2_entry |  QCOW_OFLAG_COPIED
1646                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1647                         l2_dirty = true;
1648                         res->corruptions_fixed++;
1649                     } else {
1650                         res->corruptions++;
1651                     }
1652                 }
1653             }
1654         }
1655 
1656         if (l2_dirty) {
1657             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1658                                                 l2_offset, s->cluster_size);
1659             if (ret < 0) {
1660                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1661                         "overlap check failed: %s\n", strerror(-ret));
1662                 res->check_errors++;
1663                 goto fail;
1664             }
1665 
1666             ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table,
1667                               s->cluster_size);
1668             if (ret < 0) {
1669                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1670                         strerror(-ret));
1671                 res->check_errors++;
1672                 goto fail;
1673             }
1674         }
1675     }
1676 
1677     ret = 0;
1678 
1679 fail:
1680     qemu_vfree(l2_table);
1681     return ret;
1682 }
1683 
1684 /*
1685  * Checks consistency of refblocks and accounts for each refblock in
1686  * *refcount_table.
1687  */
1688 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1689                            BdrvCheckMode fix, bool *rebuild,
1690                            void **refcount_table, int64_t *nb_clusters)
1691 {
1692     BDRVQcow2State *s = bs->opaque;
1693     int64_t i, size;
1694     int ret;
1695 
1696     for(i = 0; i < s->refcount_table_size; i++) {
1697         uint64_t offset, cluster;
1698         offset = s->refcount_table[i];
1699         cluster = offset >> s->cluster_bits;
1700 
1701         /* Refcount blocks are cluster aligned */
1702         if (offset_into_cluster(s, offset)) {
1703             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1704                 "cluster aligned; refcount table entry corrupted\n", i);
1705             res->corruptions++;
1706             *rebuild = true;
1707             continue;
1708         }
1709 
1710         if (cluster >= *nb_clusters) {
1711             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1712                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1713 
1714             if (fix & BDRV_FIX_ERRORS) {
1715                 int64_t new_nb_clusters;
1716 
1717                 if (offset > INT64_MAX - s->cluster_size) {
1718                     ret = -EINVAL;
1719                     goto resize_fail;
1720                 }
1721 
1722                 ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
1723                 if (ret < 0) {
1724                     goto resize_fail;
1725                 }
1726                 size = bdrv_getlength(bs->file->bs);
1727                 if (size < 0) {
1728                     ret = size;
1729                     goto resize_fail;
1730                 }
1731 
1732                 new_nb_clusters = size_to_clusters(s, size);
1733                 assert(new_nb_clusters >= *nb_clusters);
1734 
1735                 ret = realloc_refcount_array(s, refcount_table,
1736                                              nb_clusters, new_nb_clusters);
1737                 if (ret < 0) {
1738                     res->check_errors++;
1739                     return ret;
1740                 }
1741 
1742                 if (cluster >= *nb_clusters) {
1743                     ret = -EINVAL;
1744                     goto resize_fail;
1745                 }
1746 
1747                 res->corruptions_fixed++;
1748                 ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1749                                     offset, s->cluster_size);
1750                 if (ret < 0) {
1751                     return ret;
1752                 }
1753                 /* No need to check whether the refcount is now greater than 1:
1754                  * This area was just allocated and zeroed, so it can only be
1755                  * exactly 1 after inc_refcounts() */
1756                 continue;
1757 
1758 resize_fail:
1759                 res->corruptions++;
1760                 *rebuild = true;
1761                 fprintf(stderr, "ERROR could not resize image: %s\n",
1762                         strerror(-ret));
1763             } else {
1764                 res->corruptions++;
1765             }
1766             continue;
1767         }
1768 
1769         if (offset != 0) {
1770             ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1771                                 offset, s->cluster_size);
1772             if (ret < 0) {
1773                 return ret;
1774             }
1775             if (s->get_refcount(*refcount_table, cluster) != 1) {
1776                 fprintf(stderr, "ERROR refcount block %" PRId64
1777                         " refcount=%" PRIu64 "\n", i,
1778                         s->get_refcount(*refcount_table, cluster));
1779                 res->corruptions++;
1780                 *rebuild = true;
1781             }
1782         }
1783     }
1784 
1785     return 0;
1786 }
1787 
1788 /*
1789  * Calculates an in-memory refcount table.
1790  */
1791 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1792                                BdrvCheckMode fix, bool *rebuild,
1793                                void **refcount_table, int64_t *nb_clusters)
1794 {
1795     BDRVQcow2State *s = bs->opaque;
1796     int64_t i;
1797     QCowSnapshot *sn;
1798     int ret;
1799 
1800     if (!*refcount_table) {
1801         int64_t old_size = 0;
1802         ret = realloc_refcount_array(s, refcount_table,
1803                                      &old_size, *nb_clusters);
1804         if (ret < 0) {
1805             res->check_errors++;
1806             return ret;
1807         }
1808     }
1809 
1810     /* header */
1811     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1812                         0, s->cluster_size);
1813     if (ret < 0) {
1814         return ret;
1815     }
1816 
1817     /* current L1 table */
1818     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1819                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1820     if (ret < 0) {
1821         return ret;
1822     }
1823 
1824     /* snapshots */
1825     for (i = 0; i < s->nb_snapshots; i++) {
1826         sn = s->snapshots + i;
1827         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1828                                  sn->l1_table_offset, sn->l1_size, 0);
1829         if (ret < 0) {
1830             return ret;
1831         }
1832     }
1833     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1834                         s->snapshots_offset, s->snapshots_size);
1835     if (ret < 0) {
1836         return ret;
1837     }
1838 
1839     /* refcount data */
1840     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1841                         s->refcount_table_offset,
1842                         s->refcount_table_size * sizeof(uint64_t));
1843     if (ret < 0) {
1844         return ret;
1845     }
1846 
1847     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1848 }
1849 
1850 /*
1851  * Compares the actual reference count for each cluster in the image against the
1852  * refcount as reported by the refcount structures on-disk.
1853  */
1854 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1855                               BdrvCheckMode fix, bool *rebuild,
1856                               int64_t *highest_cluster,
1857                               void *refcount_table, int64_t nb_clusters)
1858 {
1859     BDRVQcow2State *s = bs->opaque;
1860     int64_t i;
1861     uint64_t refcount1, refcount2;
1862     int ret;
1863 
1864     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1865         ret = qcow2_get_refcount(bs, i, &refcount1);
1866         if (ret < 0) {
1867             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1868                     i, strerror(-ret));
1869             res->check_errors++;
1870             continue;
1871         }
1872 
1873         refcount2 = s->get_refcount(refcount_table, i);
1874 
1875         if (refcount1 > 0 || refcount2 > 0) {
1876             *highest_cluster = i;
1877         }
1878 
1879         if (refcount1 != refcount2) {
1880             /* Check if we're allowed to fix the mismatch */
1881             int *num_fixed = NULL;
1882             if (refcount1 == 0) {
1883                 *rebuild = true;
1884             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1885                 num_fixed = &res->leaks_fixed;
1886             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1887                 num_fixed = &res->corruptions_fixed;
1888             }
1889 
1890             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1891                     " reference=%" PRIu64 "\n",
1892                    num_fixed != NULL     ? "Repairing" :
1893                    refcount1 < refcount2 ? "ERROR" :
1894                                            "Leaked",
1895                    i, refcount1, refcount2);
1896 
1897             if (num_fixed) {
1898                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1899                                       refcount_diff(refcount1, refcount2),
1900                                       refcount1 > refcount2,
1901                                       QCOW2_DISCARD_ALWAYS);
1902                 if (ret >= 0) {
1903                     (*num_fixed)++;
1904                     continue;
1905                 }
1906             }
1907 
1908             /* And if we couldn't, print an error */
1909             if (refcount1 < refcount2) {
1910                 res->corruptions++;
1911             } else {
1912                 res->leaks++;
1913             }
1914         }
1915     }
1916 }
1917 
1918 /*
1919  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1920  * the on-disk refcount structures.
1921  *
1922  * On input, *first_free_cluster tells where to start looking, and need not
1923  * actually be a free cluster; the returned offset will not be before that
1924  * cluster.  On output, *first_free_cluster points to the first gap found, even
1925  * if that gap was too small to be used as the returned offset.
1926  *
1927  * Note that *first_free_cluster is a cluster index whereas the return value is
1928  * an offset.
1929  */
1930 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1931                                    int cluster_count,
1932                                    void **refcount_table,
1933                                    int64_t *imrt_nb_clusters,
1934                                    int64_t *first_free_cluster)
1935 {
1936     BDRVQcow2State *s = bs->opaque;
1937     int64_t cluster = *first_free_cluster, i;
1938     bool first_gap = true;
1939     int contiguous_free_clusters;
1940     int ret;
1941 
1942     /* Starting at *first_free_cluster, find a range of at least cluster_count
1943      * continuously free clusters */
1944     for (contiguous_free_clusters = 0;
1945          cluster < *imrt_nb_clusters &&
1946          contiguous_free_clusters < cluster_count;
1947          cluster++)
1948     {
1949         if (!s->get_refcount(*refcount_table, cluster)) {
1950             contiguous_free_clusters++;
1951             if (first_gap) {
1952                 /* If this is the first free cluster found, update
1953                  * *first_free_cluster accordingly */
1954                 *first_free_cluster = cluster;
1955                 first_gap = false;
1956             }
1957         } else if (contiguous_free_clusters) {
1958             contiguous_free_clusters = 0;
1959         }
1960     }
1961 
1962     /* If contiguous_free_clusters is greater than zero, it contains the number
1963      * of continuously free clusters until the current cluster; the first free
1964      * cluster in the current "gap" is therefore
1965      * cluster - contiguous_free_clusters */
1966 
1967     /* If no such range could be found, grow the in-memory refcount table
1968      * accordingly to append free clusters at the end of the image */
1969     if (contiguous_free_clusters < cluster_count) {
1970         /* contiguous_free_clusters clusters are already empty at the image end;
1971          * we need cluster_count clusters; therefore, we have to allocate
1972          * cluster_count - contiguous_free_clusters new clusters at the end of
1973          * the image (which is the current value of cluster; note that cluster
1974          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
1975          * the image end) */
1976         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1977                                      cluster + cluster_count
1978                                      - contiguous_free_clusters);
1979         if (ret < 0) {
1980             return ret;
1981         }
1982     }
1983 
1984     /* Go back to the first free cluster */
1985     cluster -= contiguous_free_clusters;
1986     for (i = 0; i < cluster_count; i++) {
1987         s->set_refcount(*refcount_table, cluster + i, 1);
1988     }
1989 
1990     return cluster << s->cluster_bits;
1991 }
1992 
1993 /*
1994  * Creates a new refcount structure based solely on the in-memory information
1995  * given through *refcount_table. All necessary allocations will be reflected
1996  * in that array.
1997  *
1998  * On success, the old refcount structure is leaked (it will be covered by the
1999  * new refcount structure).
2000  */
2001 static int rebuild_refcount_structure(BlockDriverState *bs,
2002                                       BdrvCheckResult *res,
2003                                       void **refcount_table,
2004                                       int64_t *nb_clusters)
2005 {
2006     BDRVQcow2State *s = bs->opaque;
2007     int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2008     int64_t refblock_offset, refblock_start, refblock_index;
2009     uint32_t reftable_size = 0;
2010     uint64_t *on_disk_reftable = NULL;
2011     void *on_disk_refblock;
2012     int ret = 0;
2013     struct {
2014         uint64_t reftable_offset;
2015         uint32_t reftable_clusters;
2016     } QEMU_PACKED reftable_offset_and_clusters;
2017 
2018     qcow2_cache_empty(bs, s->refcount_block_cache);
2019 
2020 write_refblocks:
2021     for (; cluster < *nb_clusters; cluster++) {
2022         if (!s->get_refcount(*refcount_table, cluster)) {
2023             continue;
2024         }
2025 
2026         refblock_index = cluster >> s->refcount_block_bits;
2027         refblock_start = refblock_index << s->refcount_block_bits;
2028 
2029         /* Don't allocate a cluster in a refblock already written to disk */
2030         if (first_free_cluster < refblock_start) {
2031             first_free_cluster = refblock_start;
2032         }
2033         refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2034                                               nb_clusters, &first_free_cluster);
2035         if (refblock_offset < 0) {
2036             fprintf(stderr, "ERROR allocating refblock: %s\n",
2037                     strerror(-refblock_offset));
2038             res->check_errors++;
2039             ret = refblock_offset;
2040             goto fail;
2041         }
2042 
2043         if (reftable_size <= refblock_index) {
2044             uint32_t old_reftable_size = reftable_size;
2045             uint64_t *new_on_disk_reftable;
2046 
2047             reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2048                                      s->cluster_size) / sizeof(uint64_t);
2049             new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2050                                                  reftable_size *
2051                                                  sizeof(uint64_t));
2052             if (!new_on_disk_reftable) {
2053                 res->check_errors++;
2054                 ret = -ENOMEM;
2055                 goto fail;
2056             }
2057             on_disk_reftable = new_on_disk_reftable;
2058 
2059             memset(on_disk_reftable + old_reftable_size, 0,
2060                    (reftable_size - old_reftable_size) * sizeof(uint64_t));
2061 
2062             /* The offset we have for the reftable is now no longer valid;
2063              * this will leak that range, but we can easily fix that by running
2064              * a leak-fixing check after this rebuild operation */
2065             reftable_offset = -1;
2066         }
2067         on_disk_reftable[refblock_index] = refblock_offset;
2068 
2069         /* If this is apparently the last refblock (for now), try to squeeze the
2070          * reftable in */
2071         if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2072             reftable_offset < 0)
2073         {
2074             uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2075                                                           sizeof(uint64_t));
2076             reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2077                                                   refcount_table, nb_clusters,
2078                                                   &first_free_cluster);
2079             if (reftable_offset < 0) {
2080                 fprintf(stderr, "ERROR allocating reftable: %s\n",
2081                         strerror(-reftable_offset));
2082                 res->check_errors++;
2083                 ret = reftable_offset;
2084                 goto fail;
2085             }
2086         }
2087 
2088         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2089                                             s->cluster_size);
2090         if (ret < 0) {
2091             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2092             goto fail;
2093         }
2094 
2095         /* The size of *refcount_table is always cluster-aligned, therefore the
2096          * write operation will not overflow */
2097         on_disk_refblock = (void *)((char *) *refcount_table +
2098                                     refblock_index * s->cluster_size);
2099 
2100         ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE,
2101                          on_disk_refblock, s->cluster_sectors);
2102         if (ret < 0) {
2103             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2104             goto fail;
2105         }
2106 
2107         /* Go to the end of this refblock */
2108         cluster = refblock_start + s->refcount_block_size - 1;
2109     }
2110 
2111     if (reftable_offset < 0) {
2112         uint64_t post_refblock_start, reftable_clusters;
2113 
2114         post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2115         reftable_clusters = size_to_clusters(s,
2116                                              reftable_size * sizeof(uint64_t));
2117         /* Not pretty but simple */
2118         if (first_free_cluster < post_refblock_start) {
2119             first_free_cluster = post_refblock_start;
2120         }
2121         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2122                                               refcount_table, nb_clusters,
2123                                               &first_free_cluster);
2124         if (reftable_offset < 0) {
2125             fprintf(stderr, "ERROR allocating reftable: %s\n",
2126                     strerror(-reftable_offset));
2127             res->check_errors++;
2128             ret = reftable_offset;
2129             goto fail;
2130         }
2131 
2132         goto write_refblocks;
2133     }
2134 
2135     assert(on_disk_reftable);
2136 
2137     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2138         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2139     }
2140 
2141     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2142                                         reftable_size * sizeof(uint64_t));
2143     if (ret < 0) {
2144         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2145         goto fail;
2146     }
2147 
2148     assert(reftable_size < INT_MAX / sizeof(uint64_t));
2149     ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable,
2150                       reftable_size * sizeof(uint64_t));
2151     if (ret < 0) {
2152         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2153         goto fail;
2154     }
2155 
2156     /* Enter new reftable into the image header */
2157     cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset,
2158                  reftable_offset);
2159     cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters,
2160                  size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2161     ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader,
2162                                                   refcount_table_offset),
2163                            &reftable_offset_and_clusters,
2164                            sizeof(reftable_offset_and_clusters));
2165     if (ret < 0) {
2166         fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2167         goto fail;
2168     }
2169 
2170     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2171         be64_to_cpus(&on_disk_reftable[refblock_index]);
2172     }
2173     s->refcount_table = on_disk_reftable;
2174     s->refcount_table_offset = reftable_offset;
2175     s->refcount_table_size = reftable_size;
2176 
2177     return 0;
2178 
2179 fail:
2180     g_free(on_disk_reftable);
2181     return ret;
2182 }
2183 
2184 /*
2185  * Checks an image for refcount consistency.
2186  *
2187  * Returns 0 if no errors are found, the number of errors in case the image is
2188  * detected as corrupted, and -errno when an internal error occurred.
2189  */
2190 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2191                           BdrvCheckMode fix)
2192 {
2193     BDRVQcow2State *s = bs->opaque;
2194     BdrvCheckResult pre_compare_res;
2195     int64_t size, highest_cluster, nb_clusters;
2196     void *refcount_table = NULL;
2197     bool rebuild = false;
2198     int ret;
2199 
2200     size = bdrv_getlength(bs->file->bs);
2201     if (size < 0) {
2202         res->check_errors++;
2203         return size;
2204     }
2205 
2206     nb_clusters = size_to_clusters(s, size);
2207     if (nb_clusters > INT_MAX) {
2208         res->check_errors++;
2209         return -EFBIG;
2210     }
2211 
2212     res->bfi.total_clusters =
2213         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2214 
2215     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2216                               &nb_clusters);
2217     if (ret < 0) {
2218         goto fail;
2219     }
2220 
2221     /* In case we don't need to rebuild the refcount structure (but want to fix
2222      * something), this function is immediately called again, in which case the
2223      * result should be ignored */
2224     pre_compare_res = *res;
2225     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2226                       nb_clusters);
2227 
2228     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2229         BdrvCheckResult old_res = *res;
2230         int fresh_leaks = 0;
2231 
2232         fprintf(stderr, "Rebuilding refcount structure\n");
2233         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2234                                          &nb_clusters);
2235         if (ret < 0) {
2236             goto fail;
2237         }
2238 
2239         res->corruptions = 0;
2240         res->leaks = 0;
2241 
2242         /* Because the old reftable has been exchanged for a new one the
2243          * references have to be recalculated */
2244         rebuild = false;
2245         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2246         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2247                                   &nb_clusters);
2248         if (ret < 0) {
2249             goto fail;
2250         }
2251 
2252         if (fix & BDRV_FIX_LEAKS) {
2253             /* The old refcount structures are now leaked, fix it; the result
2254              * can be ignored, aside from leaks which were introduced by
2255              * rebuild_refcount_structure() that could not be fixed */
2256             BdrvCheckResult saved_res = *res;
2257             *res = (BdrvCheckResult){ 0 };
2258 
2259             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2260                               &highest_cluster, refcount_table, nb_clusters);
2261             if (rebuild) {
2262                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2263                         "broken\n");
2264             }
2265 
2266             /* Any leaks accounted for here were introduced by
2267              * rebuild_refcount_structure() because that function has created a
2268              * new refcount structure from scratch */
2269             fresh_leaks = res->leaks;
2270             *res = saved_res;
2271         }
2272 
2273         if (res->corruptions < old_res.corruptions) {
2274             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2275         }
2276         if (res->leaks < old_res.leaks) {
2277             res->leaks_fixed += old_res.leaks - res->leaks;
2278         }
2279         res->leaks += fresh_leaks;
2280     } else if (fix) {
2281         if (rebuild) {
2282             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2283             res->check_errors++;
2284             ret = -EIO;
2285             goto fail;
2286         }
2287 
2288         if (res->leaks || res->corruptions) {
2289             *res = pre_compare_res;
2290             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2291                               refcount_table, nb_clusters);
2292         }
2293     }
2294 
2295     /* check OFLAG_COPIED */
2296     ret = check_oflag_copied(bs, res, fix);
2297     if (ret < 0) {
2298         goto fail;
2299     }
2300 
2301     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2302     ret = 0;
2303 
2304 fail:
2305     g_free(refcount_table);
2306 
2307     return ret;
2308 }
2309 
2310 #define overlaps_with(ofs, sz) \
2311     ranges_overlap(offset, size, ofs, sz)
2312 
2313 /*
2314  * Checks if the given offset into the image file is actually free to use by
2315  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2316  * i.e. a sanity check without relying on the refcount tables.
2317  *
2318  * The ign parameter specifies what checks not to perform (being a bitmask of
2319  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2320  *
2321  * Returns:
2322  * - 0 if writing to this offset will not affect the mentioned metadata
2323  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2324  * - a negative value (-errno) indicating an error while performing a check,
2325  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2326  */
2327 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2328                                  int64_t size)
2329 {
2330     BDRVQcow2State *s = bs->opaque;
2331     int chk = s->overlap_check & ~ign;
2332     int i, j;
2333 
2334     if (!size) {
2335         return 0;
2336     }
2337 
2338     if (chk & QCOW2_OL_MAIN_HEADER) {
2339         if (offset < s->cluster_size) {
2340             return QCOW2_OL_MAIN_HEADER;
2341         }
2342     }
2343 
2344     /* align range to test to cluster boundaries */
2345     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2346     offset = start_of_cluster(s, offset);
2347 
2348     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2349         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2350             return QCOW2_OL_ACTIVE_L1;
2351         }
2352     }
2353 
2354     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2355         if (overlaps_with(s->refcount_table_offset,
2356             s->refcount_table_size * sizeof(uint64_t))) {
2357             return QCOW2_OL_REFCOUNT_TABLE;
2358         }
2359     }
2360 
2361     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2362         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2363             return QCOW2_OL_SNAPSHOT_TABLE;
2364         }
2365     }
2366 
2367     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2368         for (i = 0; i < s->nb_snapshots; i++) {
2369             if (s->snapshots[i].l1_size &&
2370                 overlaps_with(s->snapshots[i].l1_table_offset,
2371                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2372                 return QCOW2_OL_INACTIVE_L1;
2373             }
2374         }
2375     }
2376 
2377     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2378         for (i = 0; i < s->l1_size; i++) {
2379             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2380                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2381                 s->cluster_size)) {
2382                 return QCOW2_OL_ACTIVE_L2;
2383             }
2384         }
2385     }
2386 
2387     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2388         for (i = 0; i < s->refcount_table_size; i++) {
2389             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2390                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2391                 s->cluster_size)) {
2392                 return QCOW2_OL_REFCOUNT_BLOCK;
2393             }
2394         }
2395     }
2396 
2397     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2398         for (i = 0; i < s->nb_snapshots; i++) {
2399             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2400             uint32_t l1_sz  = s->snapshots[i].l1_size;
2401             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2402             uint64_t *l1 = g_try_malloc(l1_sz2);
2403             int ret;
2404 
2405             if (l1_sz2 && l1 == NULL) {
2406                 return -ENOMEM;
2407             }
2408 
2409             ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2);
2410             if (ret < 0) {
2411                 g_free(l1);
2412                 return ret;
2413             }
2414 
2415             for (j = 0; j < l1_sz; j++) {
2416                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2417                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2418                     g_free(l1);
2419                     return QCOW2_OL_INACTIVE_L2;
2420                 }
2421             }
2422 
2423             g_free(l1);
2424         }
2425     }
2426 
2427     return 0;
2428 }
2429 
2430 static const char *metadata_ol_names[] = {
2431     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2432     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2433     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2434     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2435     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2436     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2437     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2438     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2439 };
2440 
2441 /*
2442  * First performs a check for metadata overlaps (through
2443  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2444  * while performing a check), that value is returned. If an impending overlap
2445  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2446  * and -EIO returned.
2447  *
2448  * Returns 0 if there were neither overlaps nor errors while checking for
2449  * overlaps; or a negative value (-errno) on error.
2450  */
2451 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2452                                   int64_t size)
2453 {
2454     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2455 
2456     if (ret < 0) {
2457         return ret;
2458     } else if (ret > 0) {
2459         int metadata_ol_bitnr = ctz32(ret);
2460         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2461 
2462         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2463                                 "write on metadata (overlaps with %s)",
2464                                 metadata_ol_names[metadata_ol_bitnr]);
2465         return -EIO;
2466     }
2467 
2468     return 0;
2469 }
2470