xref: /openbmc/qemu/block/qcow2-refcount.c (revision 0b8f7448)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "qcow2.h"
30 #include "qemu/range.h"
31 #include "qemu/bswap.h"
32 #include "qemu/cutils.h"
33 
34 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
35 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
36                             int64_t offset, int64_t length, uint64_t addend,
37                             bool decrease, enum qcow2_discard_type type);
38 
39 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
40 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
41 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
42 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
43 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
45 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
46 
47 static void set_refcount_ro0(void *refcount_array, uint64_t index,
48                              uint64_t value);
49 static void set_refcount_ro1(void *refcount_array, uint64_t index,
50                              uint64_t value);
51 static void set_refcount_ro2(void *refcount_array, uint64_t index,
52                              uint64_t value);
53 static void set_refcount_ro3(void *refcount_array, uint64_t index,
54                              uint64_t value);
55 static void set_refcount_ro4(void *refcount_array, uint64_t index,
56                              uint64_t value);
57 static void set_refcount_ro5(void *refcount_array, uint64_t index,
58                              uint64_t value);
59 static void set_refcount_ro6(void *refcount_array, uint64_t index,
60                              uint64_t value);
61 
62 
63 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
64     &get_refcount_ro0,
65     &get_refcount_ro1,
66     &get_refcount_ro2,
67     &get_refcount_ro3,
68     &get_refcount_ro4,
69     &get_refcount_ro5,
70     &get_refcount_ro6
71 };
72 
73 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
74     &set_refcount_ro0,
75     &set_refcount_ro1,
76     &set_refcount_ro2,
77     &set_refcount_ro3,
78     &set_refcount_ro4,
79     &set_refcount_ro5,
80     &set_refcount_ro6
81 };
82 
83 
84 /*********************************************************/
85 /* refcount handling */
86 
87 static void update_max_refcount_table_index(BDRVQcow2State *s)
88 {
89     unsigned i = s->refcount_table_size - 1;
90     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
91         i--;
92     }
93     /* Set s->max_refcount_table_index to the index of the last used entry */
94     s->max_refcount_table_index = i;
95 }
96 
97 int qcow2_refcount_init(BlockDriverState *bs)
98 {
99     BDRVQcow2State *s = bs->opaque;
100     unsigned int refcount_table_size2, i;
101     int ret;
102 
103     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
104 
105     s->get_refcount = get_refcount_funcs[s->refcount_order];
106     s->set_refcount = set_refcount_funcs[s->refcount_order];
107 
108     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
109     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
110     s->refcount_table = g_try_malloc(refcount_table_size2);
111 
112     if (s->refcount_table_size > 0) {
113         if (s->refcount_table == NULL) {
114             ret = -ENOMEM;
115             goto fail;
116         }
117         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
118         ret = bdrv_pread(bs->file, s->refcount_table_offset,
119                          s->refcount_table, refcount_table_size2);
120         if (ret < 0) {
121             goto fail;
122         }
123         for(i = 0; i < s->refcount_table_size; i++)
124             be64_to_cpus(&s->refcount_table[i]);
125         update_max_refcount_table_index(s);
126     }
127     return 0;
128  fail:
129     return ret;
130 }
131 
132 void qcow2_refcount_close(BlockDriverState *bs)
133 {
134     BDRVQcow2State *s = bs->opaque;
135     g_free(s->refcount_table);
136 }
137 
138 
139 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
140 {
141     return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
142 }
143 
144 static void set_refcount_ro0(void *refcount_array, uint64_t index,
145                              uint64_t value)
146 {
147     assert(!(value >> 1));
148     ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
149     ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
150 }
151 
152 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
153 {
154     return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
155            & 0x3;
156 }
157 
158 static void set_refcount_ro1(void *refcount_array, uint64_t index,
159                              uint64_t value)
160 {
161     assert(!(value >> 2));
162     ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
163     ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
164 }
165 
166 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
167 {
168     return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
169            & 0xf;
170 }
171 
172 static void set_refcount_ro2(void *refcount_array, uint64_t index,
173                              uint64_t value)
174 {
175     assert(!(value >> 4));
176     ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
177     ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
178 }
179 
180 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
181 {
182     return ((const uint8_t *)refcount_array)[index];
183 }
184 
185 static void set_refcount_ro3(void *refcount_array, uint64_t index,
186                              uint64_t value)
187 {
188     assert(!(value >> 8));
189     ((uint8_t *)refcount_array)[index] = value;
190 }
191 
192 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
193 {
194     return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
195 }
196 
197 static void set_refcount_ro4(void *refcount_array, uint64_t index,
198                              uint64_t value)
199 {
200     assert(!(value >> 16));
201     ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
202 }
203 
204 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
205 {
206     return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
207 }
208 
209 static void set_refcount_ro5(void *refcount_array, uint64_t index,
210                              uint64_t value)
211 {
212     assert(!(value >> 32));
213     ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
214 }
215 
216 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
217 {
218     return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
219 }
220 
221 static void set_refcount_ro6(void *refcount_array, uint64_t index,
222                              uint64_t value)
223 {
224     ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
225 }
226 
227 
228 static int load_refcount_block(BlockDriverState *bs,
229                                int64_t refcount_block_offset,
230                                void **refcount_block)
231 {
232     BDRVQcow2State *s = bs->opaque;
233 
234     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
235     return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
236                            refcount_block);
237 }
238 
239 /*
240  * Retrieves the refcount of the cluster given by its index and stores it in
241  * *refcount. Returns 0 on success and -errno on failure.
242  */
243 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
244                        uint64_t *refcount)
245 {
246     BDRVQcow2State *s = bs->opaque;
247     uint64_t refcount_table_index, block_index;
248     int64_t refcount_block_offset;
249     int ret;
250     void *refcount_block;
251 
252     refcount_table_index = cluster_index >> s->refcount_block_bits;
253     if (refcount_table_index >= s->refcount_table_size) {
254         *refcount = 0;
255         return 0;
256     }
257     refcount_block_offset =
258         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
259     if (!refcount_block_offset) {
260         *refcount = 0;
261         return 0;
262     }
263 
264     if (offset_into_cluster(s, refcount_block_offset)) {
265         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
266                                 " unaligned (reftable index: %#" PRIx64 ")",
267                                 refcount_block_offset, refcount_table_index);
268         return -EIO;
269     }
270 
271     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
272                           &refcount_block);
273     if (ret < 0) {
274         return ret;
275     }
276 
277     block_index = cluster_index & (s->refcount_block_size - 1);
278     *refcount = s->get_refcount(refcount_block, block_index);
279 
280     qcow2_cache_put(s->refcount_block_cache, &refcount_block);
281 
282     return 0;
283 }
284 
285 /* Checks if two offsets are described by the same refcount block */
286 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
287     uint64_t offset_b)
288 {
289     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
290     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
291 
292     return (block_a == block_b);
293 }
294 
295 /*
296  * Loads a refcount block. If it doesn't exist yet, it is allocated first
297  * (including growing the refcount table if needed).
298  *
299  * Returns 0 on success or -errno in error case
300  */
301 static int alloc_refcount_block(BlockDriverState *bs,
302                                 int64_t cluster_index, void **refcount_block)
303 {
304     BDRVQcow2State *s = bs->opaque;
305     unsigned int refcount_table_index;
306     int64_t ret;
307 
308     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
309 
310     /* Find the refcount block for the given cluster */
311     refcount_table_index = cluster_index >> s->refcount_block_bits;
312 
313     if (refcount_table_index < s->refcount_table_size) {
314 
315         uint64_t refcount_block_offset =
316             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
317 
318         /* If it's already there, we're done */
319         if (refcount_block_offset) {
320             if (offset_into_cluster(s, refcount_block_offset)) {
321                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
322                                         PRIx64 " unaligned (reftable index: "
323                                         "%#x)", refcount_block_offset,
324                                         refcount_table_index);
325                 return -EIO;
326             }
327 
328              return load_refcount_block(bs, refcount_block_offset,
329                                         refcount_block);
330         }
331     }
332 
333     /*
334      * If we came here, we need to allocate something. Something is at least
335      * a cluster for the new refcount block. It may also include a new refcount
336      * table if the old refcount table is too small.
337      *
338      * Note that allocating clusters here needs some special care:
339      *
340      * - We can't use the normal qcow2_alloc_clusters(), it would try to
341      *   increase the refcount and very likely we would end up with an endless
342      *   recursion. Instead we must place the refcount blocks in a way that
343      *   they can describe them themselves.
344      *
345      * - We need to consider that at this point we are inside update_refcounts
346      *   and potentially doing an initial refcount increase. This means that
347      *   some clusters have already been allocated by the caller, but their
348      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
349      *   need to return -EAGAIN to signal the caller that it needs to restart
350      *   the search for free clusters.
351      *
352      * - alloc_clusters_noref and qcow2_free_clusters may load a different
353      *   refcount block into the cache
354      */
355 
356     *refcount_block = NULL;
357 
358     /* We write to the refcount table, so we might depend on L2 tables */
359     ret = qcow2_cache_flush(bs, s->l2_table_cache);
360     if (ret < 0) {
361         return ret;
362     }
363 
364     /* Allocate the refcount block itself and mark it as used */
365     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
366     if (new_block < 0) {
367         return new_block;
368     }
369 
370     /* If we're allocating the block at offset 0 then something is wrong */
371     if (new_block == 0) {
372         qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
373                                 "allocation of refcount block at offset 0");
374         return -EIO;
375     }
376 
377 #ifdef DEBUG_ALLOC2
378     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
379         " at %" PRIx64 "\n",
380         refcount_table_index, cluster_index << s->cluster_bits, new_block);
381 #endif
382 
383     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
384         /* Zero the new refcount block before updating it */
385         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
386                                     refcount_block);
387         if (ret < 0) {
388             goto fail;
389         }
390 
391         memset(*refcount_block, 0, s->cluster_size);
392 
393         /* The block describes itself, need to update the cache */
394         int block_index = (new_block >> s->cluster_bits) &
395             (s->refcount_block_size - 1);
396         s->set_refcount(*refcount_block, block_index, 1);
397     } else {
398         /* Described somewhere else. This can recurse at most twice before we
399          * arrive at a block that describes itself. */
400         ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
401                               QCOW2_DISCARD_NEVER);
402         if (ret < 0) {
403             goto fail;
404         }
405 
406         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
407         if (ret < 0) {
408             goto fail;
409         }
410 
411         /* Initialize the new refcount block only after updating its refcount,
412          * update_refcount uses the refcount cache itself */
413         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
414                                     refcount_block);
415         if (ret < 0) {
416             goto fail;
417         }
418 
419         memset(*refcount_block, 0, s->cluster_size);
420     }
421 
422     /* Now the new refcount block needs to be written to disk */
423     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
424     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
425     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
426     if (ret < 0) {
427         goto fail;
428     }
429 
430     /* If the refcount table is big enough, just hook the block up there */
431     if (refcount_table_index < s->refcount_table_size) {
432         uint64_t data64 = cpu_to_be64(new_block);
433         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
434         ret = bdrv_pwrite_sync(bs->file,
435             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
436             &data64, sizeof(data64));
437         if (ret < 0) {
438             goto fail;
439         }
440 
441         s->refcount_table[refcount_table_index] = new_block;
442         /* If there's a hole in s->refcount_table then it can happen
443          * that refcount_table_index < s->max_refcount_table_index */
444         s->max_refcount_table_index =
445             MAX(s->max_refcount_table_index, refcount_table_index);
446 
447         /* The new refcount block may be where the caller intended to put its
448          * data, so let it restart the search. */
449         return -EAGAIN;
450     }
451 
452     qcow2_cache_put(s->refcount_block_cache, refcount_block);
453 
454     /*
455      * If we come here, we need to grow the refcount table. Again, a new
456      * refcount table needs some space and we can't simply allocate to avoid
457      * endless recursion.
458      *
459      * Therefore let's grab new refcount blocks at the end of the image, which
460      * will describe themselves and the new refcount table. This way we can
461      * reference them only in the new table and do the switch to the new
462      * refcount table at once without producing an inconsistent state in
463      * between.
464      */
465     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
466 
467     /* Calculate the number of refcount blocks needed so far; this will be the
468      * basis for calculating the index of the first cluster used for the
469      * self-describing refcount structures which we are about to create.
470      *
471      * Because we reached this point, there cannot be any refcount entries for
472      * cluster_index or higher indices yet. However, because new_block has been
473      * allocated to describe that cluster (and it will assume this role later
474      * on), we cannot use that index; also, new_block may actually have a higher
475      * cluster index than cluster_index, so it needs to be taken into account
476      * here (and 1 needs to be added to its value because that cluster is used).
477      */
478     uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
479                                             (new_block >> s->cluster_bits) + 1),
480                                         s->refcount_block_size);
481 
482     /* Create the new refcount table and blocks */
483     uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
484         s->cluster_size;
485 
486     ret = qcow2_refcount_area(bs, meta_offset, 0, false,
487                               refcount_table_index, new_block);
488     if (ret < 0) {
489         return ret;
490     }
491 
492     ret = load_refcount_block(bs, new_block, refcount_block);
493     if (ret < 0) {
494         return ret;
495     }
496 
497     /* If we were trying to do the initial refcount update for some cluster
498      * allocation, we might have used the same clusters to store newly
499      * allocated metadata. Make the caller search some new space. */
500     return -EAGAIN;
501 
502 fail:
503     if (*refcount_block != NULL) {
504         qcow2_cache_put(s->refcount_block_cache, refcount_block);
505     }
506     return ret;
507 }
508 
509 /*
510  * Starting at @start_offset, this function creates new self-covering refcount
511  * structures: A new refcount table and refcount blocks which cover all of
512  * themselves, and a number of @additional_clusters beyond their end.
513  * @start_offset must be at the end of the image file, that is, there must be
514  * only empty space beyond it.
515  * If @exact_size is false, the refcount table will have 50 % more entries than
516  * necessary so it will not need to grow again soon.
517  * If @new_refblock_offset is not zero, it contains the offset of a refcount
518  * block that should be entered into the new refcount table at index
519  * @new_refblock_index.
520  *
521  * Returns: The offset after the new refcount structures (i.e. where the
522  *          @additional_clusters may be placed) on success, -errno on error.
523  */
524 int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
525                             uint64_t additional_clusters, bool exact_size,
526                             int new_refblock_index,
527                             uint64_t new_refblock_offset)
528 {
529     BDRVQcow2State *s = bs->opaque;
530     uint64_t total_refblock_count_u64, additional_refblock_count;
531     int total_refblock_count, table_size, area_reftable_index, table_clusters;
532     int i;
533     uint64_t table_offset, block_offset, end_offset;
534     int ret;
535     uint64_t *new_table;
536 
537     assert(!(start_offset % s->cluster_size));
538 
539     qcow2_refcount_metadata_size(start_offset / s->cluster_size +
540                                  additional_clusters,
541                                  s->cluster_size, s->refcount_order,
542                                  !exact_size, &total_refblock_count_u64);
543     if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
544         return -EFBIG;
545     }
546     total_refblock_count = total_refblock_count_u64;
547 
548     /* Index in the refcount table of the first refcount block to cover the area
549      * of refcount structures we are about to create; we know that
550      * @total_refblock_count can cover @start_offset, so this will definitely
551      * fit into an int. */
552     area_reftable_index = (start_offset / s->cluster_size) /
553                           s->refcount_block_size;
554 
555     if (exact_size) {
556         table_size = total_refblock_count;
557     } else {
558         table_size = total_refblock_count +
559                      DIV_ROUND_UP(total_refblock_count, 2);
560     }
561     /* The qcow2 file can only store the reftable size in number of clusters */
562     table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
563     table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;
564 
565     if (table_size > QCOW_MAX_REFTABLE_SIZE) {
566         return -EFBIG;
567     }
568 
569     new_table = g_try_new0(uint64_t, table_size);
570 
571     assert(table_size > 0);
572     if (new_table == NULL) {
573         ret = -ENOMEM;
574         goto fail;
575     }
576 
577     /* Fill the new refcount table */
578     if (table_size > s->max_refcount_table_index) {
579         /* We're actually growing the reftable */
580         memcpy(new_table, s->refcount_table,
581                (s->max_refcount_table_index + 1) * sizeof(uint64_t));
582     } else {
583         /* Improbable case: We're shrinking the reftable. However, the caller
584          * has assured us that there is only empty space beyond @start_offset,
585          * so we can simply drop all of the refblocks that won't fit into the
586          * new reftable. */
587         memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
588     }
589 
590     if (new_refblock_offset) {
591         assert(new_refblock_index < total_refblock_count);
592         new_table[new_refblock_index] = new_refblock_offset;
593     }
594 
595     /* Count how many new refblocks we have to create */
596     additional_refblock_count = 0;
597     for (i = area_reftable_index; i < total_refblock_count; i++) {
598         if (!new_table[i]) {
599             additional_refblock_count++;
600         }
601     }
602 
603     table_offset = start_offset + additional_refblock_count * s->cluster_size;
604     end_offset = table_offset + table_clusters * s->cluster_size;
605 
606     /* Fill the refcount blocks, and create new ones, if necessary */
607     block_offset = start_offset;
608     for (i = area_reftable_index; i < total_refblock_count; i++) {
609         void *refblock_data;
610         uint64_t first_offset_covered;
611 
612         /* Reuse an existing refblock if possible, create a new one otherwise */
613         if (new_table[i]) {
614             ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
615                                   &refblock_data);
616             if (ret < 0) {
617                 goto fail;
618             }
619         } else {
620             ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
621                                         block_offset, &refblock_data);
622             if (ret < 0) {
623                 goto fail;
624             }
625             memset(refblock_data, 0, s->cluster_size);
626             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
627                                          refblock_data);
628 
629             new_table[i] = block_offset;
630             block_offset += s->cluster_size;
631         }
632 
633         /* First host offset covered by this refblock */
634         first_offset_covered = (uint64_t)i * s->refcount_block_size *
635                                s->cluster_size;
636         if (first_offset_covered < end_offset) {
637             int j, end_index;
638 
639             /* Set the refcount of all of the new refcount structures to 1 */
640 
641             if (first_offset_covered < start_offset) {
642                 assert(i == area_reftable_index);
643                 j = (start_offset - first_offset_covered) / s->cluster_size;
644                 assert(j < s->refcount_block_size);
645             } else {
646                 j = 0;
647             }
648 
649             end_index = MIN((end_offset - first_offset_covered) /
650                             s->cluster_size,
651                             s->refcount_block_size);
652 
653             for (; j < end_index; j++) {
654                 /* The caller guaranteed us this space would be empty */
655                 assert(s->get_refcount(refblock_data, j) == 0);
656                 s->set_refcount(refblock_data, j, 1);
657             }
658 
659             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
660                                          refblock_data);
661         }
662 
663         qcow2_cache_put(s->refcount_block_cache, &refblock_data);
664     }
665 
666     assert(block_offset == table_offset);
667 
668     /* Write refcount blocks to disk */
669     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
670     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
671     if (ret < 0) {
672         goto fail;
673     }
674 
675     /* Write refcount table to disk */
676     for (i = 0; i < total_refblock_count; i++) {
677         cpu_to_be64s(&new_table[i]);
678     }
679 
680     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
681     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
682         table_size * sizeof(uint64_t));
683     if (ret < 0) {
684         goto fail;
685     }
686 
687     for (i = 0; i < total_refblock_count; i++) {
688         be64_to_cpus(&new_table[i]);
689     }
690 
691     /* Hook up the new refcount table in the qcow2 header */
692     struct QEMU_PACKED {
693         uint64_t d64;
694         uint32_t d32;
695     } data;
696     data.d64 = cpu_to_be64(table_offset);
697     data.d32 = cpu_to_be32(table_clusters);
698     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
699     ret = bdrv_pwrite_sync(bs->file,
700                            offsetof(QCowHeader, refcount_table_offset),
701                            &data, sizeof(data));
702     if (ret < 0) {
703         goto fail;
704     }
705 
706     /* And switch it in memory */
707     uint64_t old_table_offset = s->refcount_table_offset;
708     uint64_t old_table_size = s->refcount_table_size;
709 
710     g_free(s->refcount_table);
711     s->refcount_table = new_table;
712     s->refcount_table_size = table_size;
713     s->refcount_table_offset = table_offset;
714     update_max_refcount_table_index(s);
715 
716     /* Free old table. */
717     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
718                         QCOW2_DISCARD_OTHER);
719 
720     return end_offset;
721 
722 fail:
723     g_free(new_table);
724     return ret;
725 }
726 
727 void qcow2_process_discards(BlockDriverState *bs, int ret)
728 {
729     BDRVQcow2State *s = bs->opaque;
730     Qcow2DiscardRegion *d, *next;
731 
732     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
733         QTAILQ_REMOVE(&s->discards, d, next);
734 
735         /* Discard is optional, ignore the return value */
736         if (ret >= 0) {
737             bdrv_pdiscard(bs->file, d->offset, d->bytes);
738         }
739 
740         g_free(d);
741     }
742 }
743 
744 static void update_refcount_discard(BlockDriverState *bs,
745                                     uint64_t offset, uint64_t length)
746 {
747     BDRVQcow2State *s = bs->opaque;
748     Qcow2DiscardRegion *d, *p, *next;
749 
750     QTAILQ_FOREACH(d, &s->discards, next) {
751         uint64_t new_start = MIN(offset, d->offset);
752         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
753 
754         if (new_end - new_start <= length + d->bytes) {
755             /* There can't be any overlap, areas ending up here have no
756              * references any more and therefore shouldn't get freed another
757              * time. */
758             assert(d->bytes + length == new_end - new_start);
759             d->offset = new_start;
760             d->bytes = new_end - new_start;
761             goto found;
762         }
763     }
764 
765     d = g_malloc(sizeof(*d));
766     *d = (Qcow2DiscardRegion) {
767         .bs     = bs,
768         .offset = offset,
769         .bytes  = length,
770     };
771     QTAILQ_INSERT_TAIL(&s->discards, d, next);
772 
773 found:
774     /* Merge discard requests if they are adjacent now */
775     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
776         if (p == d
777             || p->offset > d->offset + d->bytes
778             || d->offset > p->offset + p->bytes)
779         {
780             continue;
781         }
782 
783         /* Still no overlap possible */
784         assert(p->offset == d->offset + d->bytes
785             || d->offset == p->offset + p->bytes);
786 
787         QTAILQ_REMOVE(&s->discards, p, next);
788         d->offset = MIN(d->offset, p->offset);
789         d->bytes += p->bytes;
790         g_free(p);
791     }
792 }
793 
794 /* XXX: cache several refcount block clusters ? */
795 /* @addend is the absolute value of the addend; if @decrease is set, @addend
796  * will be subtracted from the current refcount, otherwise it will be added */
797 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
798                                                    int64_t offset,
799                                                    int64_t length,
800                                                    uint64_t addend,
801                                                    bool decrease,
802                                                    enum qcow2_discard_type type)
803 {
804     BDRVQcow2State *s = bs->opaque;
805     int64_t start, last, cluster_offset;
806     void *refcount_block = NULL;
807     int64_t old_table_index = -1;
808     int ret;
809 
810 #ifdef DEBUG_ALLOC2
811     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
812             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
813             addend);
814 #endif
815     if (length < 0) {
816         return -EINVAL;
817     } else if (length == 0) {
818         return 0;
819     }
820 
821     if (decrease) {
822         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
823             s->l2_table_cache);
824     }
825 
826     start = start_of_cluster(s, offset);
827     last = start_of_cluster(s, offset + length - 1);
828     for(cluster_offset = start; cluster_offset <= last;
829         cluster_offset += s->cluster_size)
830     {
831         int block_index;
832         uint64_t refcount;
833         int64_t cluster_index = cluster_offset >> s->cluster_bits;
834         int64_t table_index = cluster_index >> s->refcount_block_bits;
835 
836         /* Load the refcount block and allocate it if needed */
837         if (table_index != old_table_index) {
838             if (refcount_block) {
839                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
840             }
841             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
842             /* If the caller needs to restart the search for free clusters,
843              * try the same ones first to see if they're still free. */
844             if (ret == -EAGAIN) {
845                 if (s->free_cluster_index > (start >> s->cluster_bits)) {
846                     s->free_cluster_index = (start >> s->cluster_bits);
847                 }
848             }
849             if (ret < 0) {
850                 goto fail;
851             }
852         }
853         old_table_index = table_index;
854 
855         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
856 
857         /* we can update the count and save it */
858         block_index = cluster_index & (s->refcount_block_size - 1);
859 
860         refcount = s->get_refcount(refcount_block, block_index);
861         if (decrease ? (refcount - addend > refcount)
862                      : (refcount + addend < refcount ||
863                         refcount + addend > s->refcount_max))
864         {
865             ret = -EINVAL;
866             goto fail;
867         }
868         if (decrease) {
869             refcount -= addend;
870         } else {
871             refcount += addend;
872         }
873         if (refcount == 0 && cluster_index < s->free_cluster_index) {
874             s->free_cluster_index = cluster_index;
875         }
876         s->set_refcount(refcount_block, block_index, refcount);
877 
878         if (refcount == 0) {
879             void *table;
880 
881             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
882                                                 offset);
883             if (table != NULL) {
884                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
885                 qcow2_cache_discard(s->refcount_block_cache, table);
886             }
887 
888             table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
889             if (table != NULL) {
890                 qcow2_cache_discard(s->l2_table_cache, table);
891             }
892 
893             if (s->discard_passthrough[type]) {
894                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
895             }
896         }
897     }
898 
899     ret = 0;
900 fail:
901     if (!s->cache_discards) {
902         qcow2_process_discards(bs, ret);
903     }
904 
905     /* Write last changed block to disk */
906     if (refcount_block) {
907         qcow2_cache_put(s->refcount_block_cache, &refcount_block);
908     }
909 
910     /*
911      * Try do undo any updates if an error is returned (This may succeed in
912      * some cases like ENOSPC for allocating a new refcount block)
913      */
914     if (ret < 0) {
915         int dummy;
916         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
917                                 !decrease, QCOW2_DISCARD_NEVER);
918         (void)dummy;
919     }
920 
921     return ret;
922 }
923 
924 /*
925  * Increases or decreases the refcount of a given cluster.
926  *
927  * @addend is the absolute value of the addend; if @decrease is set, @addend
928  * will be subtracted from the current refcount, otherwise it will be added.
929  *
930  * On success 0 is returned; on failure -errno is returned.
931  */
932 int qcow2_update_cluster_refcount(BlockDriverState *bs,
933                                   int64_t cluster_index,
934                                   uint64_t addend, bool decrease,
935                                   enum qcow2_discard_type type)
936 {
937     BDRVQcow2State *s = bs->opaque;
938     int ret;
939 
940     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
941                           decrease, type);
942     if (ret < 0) {
943         return ret;
944     }
945 
946     return 0;
947 }
948 
949 
950 
951 /*********************************************************/
952 /* cluster allocation functions */
953 
954 
955 
956 /* return < 0 if error */
957 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
958 {
959     BDRVQcow2State *s = bs->opaque;
960     uint64_t i, nb_clusters, refcount;
961     int ret;
962 
963     /* We can't allocate clusters if they may still be queued for discard. */
964     if (s->cache_discards) {
965         qcow2_process_discards(bs, 0);
966     }
967 
968     nb_clusters = size_to_clusters(s, size);
969 retry:
970     for(i = 0; i < nb_clusters; i++) {
971         uint64_t next_cluster_index = s->free_cluster_index++;
972         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
973 
974         if (ret < 0) {
975             return ret;
976         } else if (refcount != 0) {
977             goto retry;
978         }
979     }
980 
981     /* Make sure that all offsets in the "allocated" range are representable
982      * in an int64_t */
983     if (s->free_cluster_index > 0 &&
984         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
985     {
986         return -EFBIG;
987     }
988 
989 #ifdef DEBUG_ALLOC2
990     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
991             size,
992             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
993 #endif
994     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
995 }
996 
997 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
998 {
999     int64_t offset;
1000     int ret;
1001 
1002     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
1003     do {
1004         offset = alloc_clusters_noref(bs, size);
1005         if (offset < 0) {
1006             return offset;
1007         }
1008 
1009         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1010     } while (ret == -EAGAIN);
1011 
1012     if (ret < 0) {
1013         return ret;
1014     }
1015 
1016     return offset;
1017 }
1018 
1019 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1020                                 int64_t nb_clusters)
1021 {
1022     BDRVQcow2State *s = bs->opaque;
1023     uint64_t cluster_index, refcount;
1024     uint64_t i;
1025     int ret;
1026 
1027     assert(nb_clusters >= 0);
1028     if (nb_clusters == 0) {
1029         return 0;
1030     }
1031 
1032     do {
1033         /* Check how many clusters there are free */
1034         cluster_index = offset >> s->cluster_bits;
1035         for(i = 0; i < nb_clusters; i++) {
1036             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1037             if (ret < 0) {
1038                 return ret;
1039             } else if (refcount != 0) {
1040                 break;
1041             }
1042         }
1043 
1044         /* And then allocate them */
1045         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1046                               QCOW2_DISCARD_NEVER);
1047     } while (ret == -EAGAIN);
1048 
1049     if (ret < 0) {
1050         return ret;
1051     }
1052 
1053     return i;
1054 }
1055 
1056 /* only used to allocate compressed sectors. We try to allocate
1057    contiguous sectors. size must be <= cluster_size */
1058 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
1059 {
1060     BDRVQcow2State *s = bs->opaque;
1061     int64_t offset;
1062     size_t free_in_cluster;
1063     int ret;
1064 
1065     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
1066     assert(size > 0 && size <= s->cluster_size);
1067     assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
1068 
1069     offset = s->free_byte_offset;
1070 
1071     if (offset) {
1072         uint64_t refcount;
1073         ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
1074         if (ret < 0) {
1075             return ret;
1076         }
1077 
1078         if (refcount == s->refcount_max) {
1079             offset = 0;
1080         }
1081     }
1082 
1083     free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
1084     do {
1085         if (!offset || free_in_cluster < size) {
1086             int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
1087             if (new_cluster < 0) {
1088                 return new_cluster;
1089             }
1090 
1091             if (new_cluster == 0) {
1092                 qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
1093                                         "allocation of compressed cluster "
1094                                         "at offset 0");
1095                 return -EIO;
1096             }
1097 
1098             if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
1099                 offset = new_cluster;
1100                 free_in_cluster = s->cluster_size;
1101             } else {
1102                 free_in_cluster += s->cluster_size;
1103             }
1104         }
1105 
1106         assert(offset);
1107         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1108         if (ret < 0) {
1109             offset = 0;
1110         }
1111     } while (ret == -EAGAIN);
1112     if (ret < 0) {
1113         return ret;
1114     }
1115 
1116     /* The cluster refcount was incremented; refcount blocks must be flushed
1117      * before the caller's L2 table updates. */
1118     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
1119 
1120     s->free_byte_offset = offset + size;
1121     if (!offset_into_cluster(s, s->free_byte_offset)) {
1122         s->free_byte_offset = 0;
1123     }
1124 
1125     return offset;
1126 }
1127 
1128 void qcow2_free_clusters(BlockDriverState *bs,
1129                           int64_t offset, int64_t size,
1130                           enum qcow2_discard_type type)
1131 {
1132     int ret;
1133 
1134     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1135     ret = update_refcount(bs, offset, size, 1, true, type);
1136     if (ret < 0) {
1137         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1138         /* TODO Remember the clusters to free them later and avoid leaking */
1139     }
1140 }
1141 
1142 /*
1143  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1144  * normal cluster, compressed cluster, etc.)
1145  */
1146 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1147                              int nb_clusters, enum qcow2_discard_type type)
1148 {
1149     BDRVQcow2State *s = bs->opaque;
1150 
1151     switch (qcow2_get_cluster_type(l2_entry)) {
1152     case QCOW2_CLUSTER_COMPRESSED:
1153         {
1154             int nb_csectors;
1155             nb_csectors = ((l2_entry >> s->csize_shift) &
1156                            s->csize_mask) + 1;
1157             qcow2_free_clusters(bs,
1158                 (l2_entry & s->cluster_offset_mask) & ~511,
1159                 nb_csectors * 512, type);
1160         }
1161         break;
1162     case QCOW2_CLUSTER_NORMAL:
1163     case QCOW2_CLUSTER_ZERO_ALLOC:
1164         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1165             qcow2_signal_corruption(bs, false, -1, -1,
1166                                     "Cannot free unaligned cluster %#llx",
1167                                     l2_entry & L2E_OFFSET_MASK);
1168         } else {
1169             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1170                                 nb_clusters << s->cluster_bits, type);
1171         }
1172         break;
1173     case QCOW2_CLUSTER_ZERO_PLAIN:
1174     case QCOW2_CLUSTER_UNALLOCATED:
1175         break;
1176     default:
1177         abort();
1178     }
1179 }
1180 
1181 int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
1182 {
1183     BDRVQcow2State *s = bs->opaque;
1184     int ret;
1185 
1186     ret = qcow2_cache_write(bs, s->l2_table_cache);
1187     if (ret < 0) {
1188         return ret;
1189     }
1190 
1191     if (qcow2_need_accurate_refcounts(s)) {
1192         ret = qcow2_cache_write(bs, s->refcount_block_cache);
1193         if (ret < 0) {
1194             return ret;
1195         }
1196     }
1197 
1198     return 0;
1199 }
1200 
1201 int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
1202 {
1203     int ret = qcow2_write_caches(bs);
1204     if (ret < 0) {
1205         return ret;
1206     }
1207 
1208     return bdrv_flush(bs->file->bs);
1209 }
1210 
1211 /*********************************************************/
1212 /* snapshots and image creation */
1213 
1214 
1215 
1216 /* update the refcounts of snapshots and the copied flag */
1217 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1218     int64_t l1_table_offset, int l1_size, int addend)
1219 {
1220     BDRVQcow2State *s = bs->opaque;
1221     uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
1222     bool l1_allocated = false;
1223     int64_t old_entry, old_l2_offset;
1224     unsigned slice, slice_size2, n_slices;
1225     int i, j, l1_modified = 0, nb_csectors;
1226     int ret;
1227 
1228     assert(addend >= -1 && addend <= 1);
1229 
1230     l2_slice = NULL;
1231     l1_table = NULL;
1232     l1_size2 = l1_size * sizeof(uint64_t);
1233     slice_size2 = s->l2_slice_size * sizeof(uint64_t);
1234     n_slices = s->cluster_size / slice_size2;
1235 
1236     s->cache_discards = true;
1237 
1238     /* WARNING: qcow2_snapshot_goto relies on this function not using the
1239      * l1_table_offset when it is the current s->l1_table_offset! Be careful
1240      * when changing this! */
1241     if (l1_table_offset != s->l1_table_offset) {
1242         l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
1243         if (l1_size2 && l1_table == NULL) {
1244             ret = -ENOMEM;
1245             goto fail;
1246         }
1247         l1_allocated = true;
1248 
1249         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1250         if (ret < 0) {
1251             goto fail;
1252         }
1253 
1254         for (i = 0; i < l1_size; i++) {
1255             be64_to_cpus(&l1_table[i]);
1256         }
1257     } else {
1258         assert(l1_size == s->l1_size);
1259         l1_table = s->l1_table;
1260         l1_allocated = false;
1261     }
1262 
1263     for (i = 0; i < l1_size; i++) {
1264         l2_offset = l1_table[i];
1265         if (l2_offset) {
1266             old_l2_offset = l2_offset;
1267             l2_offset &= L1E_OFFSET_MASK;
1268 
1269             if (offset_into_cluster(s, l2_offset)) {
1270                 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1271                                         PRIx64 " unaligned (L1 index: %#x)",
1272                                         l2_offset, i);
1273                 ret = -EIO;
1274                 goto fail;
1275             }
1276 
1277             for (slice = 0; slice < n_slices; slice++) {
1278                 ret = qcow2_cache_get(bs, s->l2_table_cache,
1279                                       l2_offset + slice * slice_size2,
1280                                       (void **) &l2_slice);
1281                 if (ret < 0) {
1282                     goto fail;
1283                 }
1284 
1285                 for (j = 0; j < s->l2_slice_size; j++) {
1286                     uint64_t cluster_index;
1287                     uint64_t offset;
1288 
1289                     entry = be64_to_cpu(l2_slice[j]);
1290                     old_entry = entry;
1291                     entry &= ~QCOW_OFLAG_COPIED;
1292                     offset = entry & L2E_OFFSET_MASK;
1293 
1294                     switch (qcow2_get_cluster_type(entry)) {
1295                     case QCOW2_CLUSTER_COMPRESSED:
1296                         nb_csectors = ((entry >> s->csize_shift) &
1297                                        s->csize_mask) + 1;
1298                         if (addend != 0) {
1299                             ret = update_refcount(
1300                                 bs, (entry & s->cluster_offset_mask) & ~511,
1301                                 nb_csectors * 512, abs(addend), addend < 0,
1302                                 QCOW2_DISCARD_SNAPSHOT);
1303                             if (ret < 0) {
1304                                 goto fail;
1305                             }
1306                         }
1307                         /* compressed clusters are never modified */
1308                         refcount = 2;
1309                         break;
1310 
1311                     case QCOW2_CLUSTER_NORMAL:
1312                     case QCOW2_CLUSTER_ZERO_ALLOC:
1313                         if (offset_into_cluster(s, offset)) {
1314                             /* Here l2_index means table (not slice) index */
1315                             int l2_index = slice * s->l2_slice_size + j;
1316                             qcow2_signal_corruption(
1317                                 bs, true, -1, -1, "Cluster "
1318                                 "allocation offset %#" PRIx64
1319                                 " unaligned (L2 offset: %#"
1320                                 PRIx64 ", L2 index: %#x)",
1321                                 offset, l2_offset, l2_index);
1322                             ret = -EIO;
1323                             goto fail;
1324                         }
1325 
1326                         cluster_index = offset >> s->cluster_bits;
1327                         assert(cluster_index);
1328                         if (addend != 0) {
1329                             ret = qcow2_update_cluster_refcount(
1330                                 bs, cluster_index, abs(addend), addend < 0,
1331                                 QCOW2_DISCARD_SNAPSHOT);
1332                             if (ret < 0) {
1333                                 goto fail;
1334                             }
1335                         }
1336 
1337                         ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1338                         if (ret < 0) {
1339                             goto fail;
1340                         }
1341                         break;
1342 
1343                     case QCOW2_CLUSTER_ZERO_PLAIN:
1344                     case QCOW2_CLUSTER_UNALLOCATED:
1345                         refcount = 0;
1346                         break;
1347 
1348                     default:
1349                         abort();
1350                     }
1351 
1352                     if (refcount == 1) {
1353                         entry |= QCOW_OFLAG_COPIED;
1354                     }
1355                     if (entry != old_entry) {
1356                         if (addend > 0) {
1357                             qcow2_cache_set_dependency(bs, s->l2_table_cache,
1358                                                        s->refcount_block_cache);
1359                         }
1360                         l2_slice[j] = cpu_to_be64(entry);
1361                         qcow2_cache_entry_mark_dirty(s->l2_table_cache,
1362                                                      l2_slice);
1363                     }
1364                 }
1365 
1366                 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1367             }
1368 
1369             if (addend != 0) {
1370                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1371                                                         s->cluster_bits,
1372                                                     abs(addend), addend < 0,
1373                                                     QCOW2_DISCARD_SNAPSHOT);
1374                 if (ret < 0) {
1375                     goto fail;
1376                 }
1377             }
1378             ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1379                                      &refcount);
1380             if (ret < 0) {
1381                 goto fail;
1382             } else if (refcount == 1) {
1383                 l2_offset |= QCOW_OFLAG_COPIED;
1384             }
1385             if (l2_offset != old_l2_offset) {
1386                 l1_table[i] = l2_offset;
1387                 l1_modified = 1;
1388             }
1389         }
1390     }
1391 
1392     ret = bdrv_flush(bs);
1393 fail:
1394     if (l2_slice) {
1395         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1396     }
1397 
1398     s->cache_discards = false;
1399     qcow2_process_discards(bs, ret);
1400 
1401     /* Update L1 only if it isn't deleted anyway (addend = -1) */
1402     if (ret == 0 && addend >= 0 && l1_modified) {
1403         for (i = 0; i < l1_size; i++) {
1404             cpu_to_be64s(&l1_table[i]);
1405         }
1406 
1407         ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
1408                                l1_table, l1_size2);
1409 
1410         for (i = 0; i < l1_size; i++) {
1411             be64_to_cpus(&l1_table[i]);
1412         }
1413     }
1414     if (l1_allocated)
1415         g_free(l1_table);
1416     return ret;
1417 }
1418 
1419 
1420 
1421 
1422 /*********************************************************/
1423 /* refcount checking functions */
1424 
1425 
1426 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1427 {
1428     /* This assertion holds because there is no way we can address more than
1429      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1430      * offsets have to be representable in bytes); due to every cluster
1431      * corresponding to one refcount entry, we are well below that limit */
1432     assert(entries < (UINT64_C(1) << (64 - 9)));
1433 
1434     /* Thanks to the assertion this will not overflow, because
1435      * s->refcount_order < 7.
1436      * (note: x << s->refcount_order == x * s->refcount_bits) */
1437     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1438 }
1439 
1440 /**
1441  * Reallocates *array so that it can hold new_size entries. *size must contain
1442  * the current number of entries in *array. If the reallocation fails, *array
1443  * and *size will not be modified and -errno will be returned. If the
1444  * reallocation is successful, *array will be set to the new buffer, *size
1445  * will be set to new_size and 0 will be returned. The size of the reallocated
1446  * refcount array buffer will be aligned to a cluster boundary, and the newly
1447  * allocated area will be zeroed.
1448  */
1449 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1450                                   int64_t *size, int64_t new_size)
1451 {
1452     int64_t old_byte_size, new_byte_size;
1453     void *new_ptr;
1454 
1455     /* Round to clusters so the array can be directly written to disk */
1456     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1457                     * s->cluster_size;
1458     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1459                     * s->cluster_size;
1460 
1461     if (new_byte_size == old_byte_size) {
1462         *size = new_size;
1463         return 0;
1464     }
1465 
1466     assert(new_byte_size > 0);
1467 
1468     if (new_byte_size > SIZE_MAX) {
1469         return -ENOMEM;
1470     }
1471 
1472     new_ptr = g_try_realloc(*array, new_byte_size);
1473     if (!new_ptr) {
1474         return -ENOMEM;
1475     }
1476 
1477     if (new_byte_size > old_byte_size) {
1478         memset((char *)new_ptr + old_byte_size, 0,
1479                new_byte_size - old_byte_size);
1480     }
1481 
1482     *array = new_ptr;
1483     *size  = new_size;
1484 
1485     return 0;
1486 }
1487 
1488 /*
1489  * Increases the refcount for a range of clusters in a given refcount table.
1490  * This is used to construct a temporary refcount table out of L1 and L2 tables
1491  * which can be compared to the refcount table saved in the image.
1492  *
1493  * Modifies the number of errors in res.
1494  */
1495 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1496                              void **refcount_table,
1497                              int64_t *refcount_table_size,
1498                              int64_t offset, int64_t size)
1499 {
1500     BDRVQcow2State *s = bs->opaque;
1501     uint64_t start, last, cluster_offset, k, refcount;
1502     int ret;
1503 
1504     if (size <= 0) {
1505         return 0;
1506     }
1507 
1508     start = start_of_cluster(s, offset);
1509     last = start_of_cluster(s, offset + size - 1);
1510     for(cluster_offset = start; cluster_offset <= last;
1511         cluster_offset += s->cluster_size) {
1512         k = cluster_offset >> s->cluster_bits;
1513         if (k >= *refcount_table_size) {
1514             ret = realloc_refcount_array(s, refcount_table,
1515                                          refcount_table_size, k + 1);
1516             if (ret < 0) {
1517                 res->check_errors++;
1518                 return ret;
1519             }
1520         }
1521 
1522         refcount = s->get_refcount(*refcount_table, k);
1523         if (refcount == s->refcount_max) {
1524             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1525                     "\n", cluster_offset);
1526             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1527                     "width or qemu-img convert to create a clean copy if the "
1528                     "image cannot be opened for writing\n");
1529             res->corruptions++;
1530             continue;
1531         }
1532         s->set_refcount(*refcount_table, k, refcount + 1);
1533     }
1534 
1535     return 0;
1536 }
1537 
1538 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1539 enum {
1540     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1541 };
1542 
1543 /*
1544  * Increases the refcount in the given refcount table for the all clusters
1545  * referenced in the L2 table. While doing so, performs some checks on L2
1546  * entries.
1547  *
1548  * Returns the number of errors found by the checks or -errno if an internal
1549  * error occurred.
1550  */
1551 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1552                               void **refcount_table,
1553                               int64_t *refcount_table_size, int64_t l2_offset,
1554                               int flags, BdrvCheckMode fix)
1555 {
1556     BDRVQcow2State *s = bs->opaque;
1557     uint64_t *l2_table, l2_entry;
1558     uint64_t next_contiguous_offset = 0;
1559     int i, l2_size, nb_csectors, ret;
1560 
1561     /* Read L2 table from disk */
1562     l2_size = s->l2_size * sizeof(uint64_t);
1563     l2_table = g_malloc(l2_size);
1564 
1565     ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1566     if (ret < 0) {
1567         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1568         res->check_errors++;
1569         goto fail;
1570     }
1571 
1572     /* Do the actual checks */
1573     for(i = 0; i < s->l2_size; i++) {
1574         l2_entry = be64_to_cpu(l2_table[i]);
1575 
1576         switch (qcow2_get_cluster_type(l2_entry)) {
1577         case QCOW2_CLUSTER_COMPRESSED:
1578             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1579             if (l2_entry & QCOW_OFLAG_COPIED) {
1580                 fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1581                     "copied flag must never be set for compressed "
1582                     "clusters\n", l2_entry & s->cluster_offset_mask);
1583                 l2_entry &= ~QCOW_OFLAG_COPIED;
1584                 res->corruptions++;
1585             }
1586 
1587             /* Mark cluster as used */
1588             nb_csectors = ((l2_entry >> s->csize_shift) &
1589                            s->csize_mask) + 1;
1590             l2_entry &= s->cluster_offset_mask;
1591             ret = qcow2_inc_refcounts_imrt(bs, res,
1592                                            refcount_table, refcount_table_size,
1593                                            l2_entry & ~511, nb_csectors * 512);
1594             if (ret < 0) {
1595                 goto fail;
1596             }
1597 
1598             if (flags & CHECK_FRAG_INFO) {
1599                 res->bfi.allocated_clusters++;
1600                 res->bfi.compressed_clusters++;
1601 
1602                 /* Compressed clusters are fragmented by nature.  Since they
1603                  * take up sub-sector space but we only have sector granularity
1604                  * I/O we need to re-read the same sectors even for adjacent
1605                  * compressed clusters.
1606                  */
1607                 res->bfi.fragmented_clusters++;
1608             }
1609             break;
1610 
1611         case QCOW2_CLUSTER_ZERO_ALLOC:
1612         case QCOW2_CLUSTER_NORMAL:
1613         {
1614             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1615 
1616             if (flags & CHECK_FRAG_INFO) {
1617                 res->bfi.allocated_clusters++;
1618                 if (next_contiguous_offset &&
1619                     offset != next_contiguous_offset) {
1620                     res->bfi.fragmented_clusters++;
1621                 }
1622                 next_contiguous_offset = offset + s->cluster_size;
1623             }
1624 
1625             /* Correct offsets are cluster aligned */
1626             if (offset_into_cluster(s, offset)) {
1627                 if (qcow2_get_cluster_type(l2_entry) ==
1628                     QCOW2_CLUSTER_ZERO_ALLOC)
1629                 {
1630                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
1631                             "cluster is not properly aligned; L2 entry "
1632                             "corrupted.\n",
1633                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1634                             offset);
1635                     if (fix & BDRV_FIX_ERRORS) {
1636                         uint64_t l2e_offset =
1637                             l2_offset + (uint64_t)i * sizeof(uint64_t);
1638 
1639                         l2_entry = QCOW_OFLAG_ZERO;
1640                         l2_table[i] = cpu_to_be64(l2_entry);
1641                         ret = qcow2_pre_write_overlap_check(bs,
1642                                 QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
1643                                 l2e_offset, sizeof(uint64_t));
1644                         if (ret < 0) {
1645                             fprintf(stderr, "ERROR: Overlap check failed\n");
1646                             res->check_errors++;
1647                             /* Something is seriously wrong, so abort checking
1648                              * this L2 table */
1649                             goto fail;
1650                         }
1651 
1652                         ret = bdrv_pwrite_sync(bs->file, l2e_offset,
1653                                                &l2_table[i], sizeof(uint64_t));
1654                         if (ret < 0) {
1655                             fprintf(stderr, "ERROR: Failed to overwrite L2 "
1656                                     "table entry: %s\n", strerror(-ret));
1657                             res->check_errors++;
1658                             /* Do not abort, continue checking the rest of this
1659                              * L2 table's entries */
1660                         } else {
1661                             res->corruptions_fixed++;
1662                             /* Skip marking the cluster as used
1663                              * (it is unused now) */
1664                             continue;
1665                         }
1666                     } else {
1667                         res->corruptions++;
1668                     }
1669                 } else {
1670                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1671                         "not properly aligned; L2 entry corrupted.\n", offset);
1672                     res->corruptions++;
1673                 }
1674             }
1675 
1676             /* Mark cluster as used */
1677             ret = qcow2_inc_refcounts_imrt(bs, res,
1678                                            refcount_table, refcount_table_size,
1679                                            offset, s->cluster_size);
1680             if (ret < 0) {
1681                 goto fail;
1682             }
1683             break;
1684         }
1685 
1686         case QCOW2_CLUSTER_ZERO_PLAIN:
1687         case QCOW2_CLUSTER_UNALLOCATED:
1688             break;
1689 
1690         default:
1691             abort();
1692         }
1693     }
1694 
1695     g_free(l2_table);
1696     return 0;
1697 
1698 fail:
1699     g_free(l2_table);
1700     return ret;
1701 }
1702 
1703 /*
1704  * Increases the refcount for the L1 table, its L2 tables and all referenced
1705  * clusters in the given refcount table. While doing so, performs some checks
1706  * on L1 and L2 entries.
1707  *
1708  * Returns the number of errors found by the checks or -errno if an internal
1709  * error occurred.
1710  */
1711 static int check_refcounts_l1(BlockDriverState *bs,
1712                               BdrvCheckResult *res,
1713                               void **refcount_table,
1714                               int64_t *refcount_table_size,
1715                               int64_t l1_table_offset, int l1_size,
1716                               int flags, BdrvCheckMode fix)
1717 {
1718     BDRVQcow2State *s = bs->opaque;
1719     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1720     int i, ret;
1721 
1722     l1_size2 = l1_size * sizeof(uint64_t);
1723 
1724     /* Mark L1 table as used */
1725     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1726                                    l1_table_offset, l1_size2);
1727     if (ret < 0) {
1728         goto fail;
1729     }
1730 
1731     /* Read L1 table entries from disk */
1732     if (l1_size2 > 0) {
1733         l1_table = g_try_malloc(l1_size2);
1734         if (l1_table == NULL) {
1735             ret = -ENOMEM;
1736             res->check_errors++;
1737             goto fail;
1738         }
1739         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1740         if (ret < 0) {
1741             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1742             res->check_errors++;
1743             goto fail;
1744         }
1745         for(i = 0;i < l1_size; i++)
1746             be64_to_cpus(&l1_table[i]);
1747     }
1748 
1749     /* Do the actual checks */
1750     for(i = 0; i < l1_size; i++) {
1751         l2_offset = l1_table[i];
1752         if (l2_offset) {
1753             /* Mark L2 table as used */
1754             l2_offset &= L1E_OFFSET_MASK;
1755             ret = qcow2_inc_refcounts_imrt(bs, res,
1756                                            refcount_table, refcount_table_size,
1757                                            l2_offset, s->cluster_size);
1758             if (ret < 0) {
1759                 goto fail;
1760             }
1761 
1762             /* L2 tables are cluster aligned */
1763             if (offset_into_cluster(s, l2_offset)) {
1764                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1765                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1766                 res->corruptions++;
1767             }
1768 
1769             /* Process and check L2 entries */
1770             ret = check_refcounts_l2(bs, res, refcount_table,
1771                                      refcount_table_size, l2_offset, flags,
1772                                      fix);
1773             if (ret < 0) {
1774                 goto fail;
1775             }
1776         }
1777     }
1778     g_free(l1_table);
1779     return 0;
1780 
1781 fail:
1782     g_free(l1_table);
1783     return ret;
1784 }
1785 
1786 /*
1787  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1788  *
1789  * This function does not print an error message nor does it increment
1790  * check_errors if qcow2_get_refcount fails (this is because such an error will
1791  * have been already detected and sufficiently signaled by the calling function
1792  * (qcow2_check_refcounts) by the time this function is called).
1793  */
1794 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1795                               BdrvCheckMode fix)
1796 {
1797     BDRVQcow2State *s = bs->opaque;
1798     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1799     int ret;
1800     uint64_t refcount;
1801     int i, j;
1802     bool repair;
1803 
1804     if (fix & BDRV_FIX_ERRORS) {
1805         /* Always repair */
1806         repair = true;
1807     } else if (fix & BDRV_FIX_LEAKS) {
1808         /* Repair only if that seems safe: This function is always
1809          * called after the refcounts have been fixed, so the refcount
1810          * is accurate if that repair was successful */
1811         repair = !res->check_errors && !res->corruptions && !res->leaks;
1812     } else {
1813         repair = false;
1814     }
1815 
1816     for (i = 0; i < s->l1_size; i++) {
1817         uint64_t l1_entry = s->l1_table[i];
1818         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1819         bool l2_dirty = false;
1820 
1821         if (!l2_offset) {
1822             continue;
1823         }
1824 
1825         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1826                                  &refcount);
1827         if (ret < 0) {
1828             /* don't print message nor increment check_errors */
1829             continue;
1830         }
1831         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1832             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1833                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1834                     repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1835             if (repair) {
1836                 s->l1_table[i] = refcount == 1
1837                                ? l1_entry |  QCOW_OFLAG_COPIED
1838                                : l1_entry & ~QCOW_OFLAG_COPIED;
1839                 ret = qcow2_write_l1_entry(bs, i);
1840                 if (ret < 0) {
1841                     res->check_errors++;
1842                     goto fail;
1843                 }
1844                 res->corruptions_fixed++;
1845             } else {
1846                 res->corruptions++;
1847             }
1848         }
1849 
1850         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1851                          s->l2_size * sizeof(uint64_t));
1852         if (ret < 0) {
1853             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1854                     strerror(-ret));
1855             res->check_errors++;
1856             goto fail;
1857         }
1858 
1859         for (j = 0; j < s->l2_size; j++) {
1860             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1861             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1862             QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
1863 
1864             if (cluster_type == QCOW2_CLUSTER_NORMAL ||
1865                 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
1866                 ret = qcow2_get_refcount(bs,
1867                                          data_offset >> s->cluster_bits,
1868                                          &refcount);
1869                 if (ret < 0) {
1870                     /* don't print message nor increment check_errors */
1871                     continue;
1872                 }
1873                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1874                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1875                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1876                             repair ? "Repairing" : "ERROR", l2_entry, refcount);
1877                     if (repair) {
1878                         l2_table[j] = cpu_to_be64(refcount == 1
1879                                     ? l2_entry |  QCOW_OFLAG_COPIED
1880                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1881                         l2_dirty = true;
1882                         res->corruptions_fixed++;
1883                     } else {
1884                         res->corruptions++;
1885                     }
1886                 }
1887             }
1888         }
1889 
1890         if (l2_dirty) {
1891             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1892                                                 l2_offset, s->cluster_size);
1893             if (ret < 0) {
1894                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1895                         "overlap check failed: %s\n", strerror(-ret));
1896                 res->check_errors++;
1897                 goto fail;
1898             }
1899 
1900             ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
1901                               s->cluster_size);
1902             if (ret < 0) {
1903                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1904                         strerror(-ret));
1905                 res->check_errors++;
1906                 goto fail;
1907             }
1908         }
1909     }
1910 
1911     ret = 0;
1912 
1913 fail:
1914     qemu_vfree(l2_table);
1915     return ret;
1916 }
1917 
1918 /*
1919  * Checks consistency of refblocks and accounts for each refblock in
1920  * *refcount_table.
1921  */
1922 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1923                            BdrvCheckMode fix, bool *rebuild,
1924                            void **refcount_table, int64_t *nb_clusters)
1925 {
1926     BDRVQcow2State *s = bs->opaque;
1927     int64_t i, size;
1928     int ret;
1929 
1930     for(i = 0; i < s->refcount_table_size; i++) {
1931         uint64_t offset, cluster;
1932         offset = s->refcount_table[i];
1933         cluster = offset >> s->cluster_bits;
1934 
1935         /* Refcount blocks are cluster aligned */
1936         if (offset_into_cluster(s, offset)) {
1937             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1938                 "cluster aligned; refcount table entry corrupted\n", i);
1939             res->corruptions++;
1940             *rebuild = true;
1941             continue;
1942         }
1943 
1944         if (cluster >= *nb_clusters) {
1945             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1946                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1947 
1948             if (fix & BDRV_FIX_ERRORS) {
1949                 int64_t new_nb_clusters;
1950                 Error *local_err = NULL;
1951 
1952                 if (offset > INT64_MAX - s->cluster_size) {
1953                     ret = -EINVAL;
1954                     goto resize_fail;
1955                 }
1956 
1957                 ret = bdrv_truncate(bs->file, offset + s->cluster_size,
1958                                     PREALLOC_MODE_OFF, &local_err);
1959                 if (ret < 0) {
1960                     error_report_err(local_err);
1961                     goto resize_fail;
1962                 }
1963                 size = bdrv_getlength(bs->file->bs);
1964                 if (size < 0) {
1965                     ret = size;
1966                     goto resize_fail;
1967                 }
1968 
1969                 new_nb_clusters = size_to_clusters(s, size);
1970                 assert(new_nb_clusters >= *nb_clusters);
1971 
1972                 ret = realloc_refcount_array(s, refcount_table,
1973                                              nb_clusters, new_nb_clusters);
1974                 if (ret < 0) {
1975                     res->check_errors++;
1976                     return ret;
1977                 }
1978 
1979                 if (cluster >= *nb_clusters) {
1980                     ret = -EINVAL;
1981                     goto resize_fail;
1982                 }
1983 
1984                 res->corruptions_fixed++;
1985                 ret = qcow2_inc_refcounts_imrt(bs, res,
1986                                                refcount_table, nb_clusters,
1987                                                offset, s->cluster_size);
1988                 if (ret < 0) {
1989                     return ret;
1990                 }
1991                 /* No need to check whether the refcount is now greater than 1:
1992                  * This area was just allocated and zeroed, so it can only be
1993                  * exactly 1 after qcow2_inc_refcounts_imrt() */
1994                 continue;
1995 
1996 resize_fail:
1997                 res->corruptions++;
1998                 *rebuild = true;
1999                 fprintf(stderr, "ERROR could not resize image: %s\n",
2000                         strerror(-ret));
2001             } else {
2002                 res->corruptions++;
2003             }
2004             continue;
2005         }
2006 
2007         if (offset != 0) {
2008             ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2009                                            offset, s->cluster_size);
2010             if (ret < 0) {
2011                 return ret;
2012             }
2013             if (s->get_refcount(*refcount_table, cluster) != 1) {
2014                 fprintf(stderr, "ERROR refcount block %" PRId64
2015                         " refcount=%" PRIu64 "\n", i,
2016                         s->get_refcount(*refcount_table, cluster));
2017                 res->corruptions++;
2018                 *rebuild = true;
2019             }
2020         }
2021     }
2022 
2023     return 0;
2024 }
2025 
2026 /*
2027  * Calculates an in-memory refcount table.
2028  */
2029 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2030                                BdrvCheckMode fix, bool *rebuild,
2031                                void **refcount_table, int64_t *nb_clusters)
2032 {
2033     BDRVQcow2State *s = bs->opaque;
2034     int64_t i;
2035     QCowSnapshot *sn;
2036     int ret;
2037 
2038     if (!*refcount_table) {
2039         int64_t old_size = 0;
2040         ret = realloc_refcount_array(s, refcount_table,
2041                                      &old_size, *nb_clusters);
2042         if (ret < 0) {
2043             res->check_errors++;
2044             return ret;
2045         }
2046     }
2047 
2048     /* header */
2049     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2050                                    0, s->cluster_size);
2051     if (ret < 0) {
2052         return ret;
2053     }
2054 
2055     /* current L1 table */
2056     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2057                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2058                              fix);
2059     if (ret < 0) {
2060         return ret;
2061     }
2062 
2063     /* snapshots */
2064     for (i = 0; i < s->nb_snapshots; i++) {
2065         sn = s->snapshots + i;
2066         if (offset_into_cluster(s, sn->l1_table_offset)) {
2067             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2068                     "L1 table is not cluster aligned; snapshot table entry "
2069                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2070             res->corruptions++;
2071             continue;
2072         }
2073         if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
2074             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2075                     "L1 table is too large; snapshot table entry corrupted\n",
2076                     sn->id_str, sn->name, sn->l1_size);
2077             res->corruptions++;
2078             continue;
2079         }
2080         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2081                                  sn->l1_table_offset, sn->l1_size, 0, fix);
2082         if (ret < 0) {
2083             return ret;
2084         }
2085     }
2086     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2087                                    s->snapshots_offset, s->snapshots_size);
2088     if (ret < 0) {
2089         return ret;
2090     }
2091 
2092     /* refcount data */
2093     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2094                                    s->refcount_table_offset,
2095                                    s->refcount_table_size * sizeof(uint64_t));
2096     if (ret < 0) {
2097         return ret;
2098     }
2099 
2100     /* encryption */
2101     if (s->crypto_header.length) {
2102         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2103                                        s->crypto_header.offset,
2104                                        s->crypto_header.length);
2105         if (ret < 0) {
2106             return ret;
2107         }
2108     }
2109 
2110     /* bitmaps */
2111     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2112     if (ret < 0) {
2113         return ret;
2114     }
2115 
2116     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2117 }
2118 
2119 /*
2120  * Compares the actual reference count for each cluster in the image against the
2121  * refcount as reported by the refcount structures on-disk.
2122  */
2123 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2124                               BdrvCheckMode fix, bool *rebuild,
2125                               int64_t *highest_cluster,
2126                               void *refcount_table, int64_t nb_clusters)
2127 {
2128     BDRVQcow2State *s = bs->opaque;
2129     int64_t i;
2130     uint64_t refcount1, refcount2;
2131     int ret;
2132 
2133     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2134         ret = qcow2_get_refcount(bs, i, &refcount1);
2135         if (ret < 0) {
2136             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2137                     i, strerror(-ret));
2138             res->check_errors++;
2139             continue;
2140         }
2141 
2142         refcount2 = s->get_refcount(refcount_table, i);
2143 
2144         if (refcount1 > 0 || refcount2 > 0) {
2145             *highest_cluster = i;
2146         }
2147 
2148         if (refcount1 != refcount2) {
2149             /* Check if we're allowed to fix the mismatch */
2150             int *num_fixed = NULL;
2151             if (refcount1 == 0) {
2152                 *rebuild = true;
2153             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2154                 num_fixed = &res->leaks_fixed;
2155             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2156                 num_fixed = &res->corruptions_fixed;
2157             }
2158 
2159             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2160                     " reference=%" PRIu64 "\n",
2161                    num_fixed != NULL     ? "Repairing" :
2162                    refcount1 < refcount2 ? "ERROR" :
2163                                            "Leaked",
2164                    i, refcount1, refcount2);
2165 
2166             if (num_fixed) {
2167                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2168                                       refcount_diff(refcount1, refcount2),
2169                                       refcount1 > refcount2,
2170                                       QCOW2_DISCARD_ALWAYS);
2171                 if (ret >= 0) {
2172                     (*num_fixed)++;
2173                     continue;
2174                 }
2175             }
2176 
2177             /* And if we couldn't, print an error */
2178             if (refcount1 < refcount2) {
2179                 res->corruptions++;
2180             } else {
2181                 res->leaks++;
2182             }
2183         }
2184     }
2185 }
2186 
2187 /*
2188  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2189  * the on-disk refcount structures.
2190  *
2191  * On input, *first_free_cluster tells where to start looking, and need not
2192  * actually be a free cluster; the returned offset will not be before that
2193  * cluster.  On output, *first_free_cluster points to the first gap found, even
2194  * if that gap was too small to be used as the returned offset.
2195  *
2196  * Note that *first_free_cluster is a cluster index whereas the return value is
2197  * an offset.
2198  */
2199 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2200                                    int cluster_count,
2201                                    void **refcount_table,
2202                                    int64_t *imrt_nb_clusters,
2203                                    int64_t *first_free_cluster)
2204 {
2205     BDRVQcow2State *s = bs->opaque;
2206     int64_t cluster = *first_free_cluster, i;
2207     bool first_gap = true;
2208     int contiguous_free_clusters;
2209     int ret;
2210 
2211     /* Starting at *first_free_cluster, find a range of at least cluster_count
2212      * continuously free clusters */
2213     for (contiguous_free_clusters = 0;
2214          cluster < *imrt_nb_clusters &&
2215          contiguous_free_clusters < cluster_count;
2216          cluster++)
2217     {
2218         if (!s->get_refcount(*refcount_table, cluster)) {
2219             contiguous_free_clusters++;
2220             if (first_gap) {
2221                 /* If this is the first free cluster found, update
2222                  * *first_free_cluster accordingly */
2223                 *first_free_cluster = cluster;
2224                 first_gap = false;
2225             }
2226         } else if (contiguous_free_clusters) {
2227             contiguous_free_clusters = 0;
2228         }
2229     }
2230 
2231     /* If contiguous_free_clusters is greater than zero, it contains the number
2232      * of continuously free clusters until the current cluster; the first free
2233      * cluster in the current "gap" is therefore
2234      * cluster - contiguous_free_clusters */
2235 
2236     /* If no such range could be found, grow the in-memory refcount table
2237      * accordingly to append free clusters at the end of the image */
2238     if (contiguous_free_clusters < cluster_count) {
2239         /* contiguous_free_clusters clusters are already empty at the image end;
2240          * we need cluster_count clusters; therefore, we have to allocate
2241          * cluster_count - contiguous_free_clusters new clusters at the end of
2242          * the image (which is the current value of cluster; note that cluster
2243          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2244          * the image end) */
2245         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2246                                      cluster + cluster_count
2247                                      - contiguous_free_clusters);
2248         if (ret < 0) {
2249             return ret;
2250         }
2251     }
2252 
2253     /* Go back to the first free cluster */
2254     cluster -= contiguous_free_clusters;
2255     for (i = 0; i < cluster_count; i++) {
2256         s->set_refcount(*refcount_table, cluster + i, 1);
2257     }
2258 
2259     return cluster << s->cluster_bits;
2260 }
2261 
2262 /*
2263  * Creates a new refcount structure based solely on the in-memory information
2264  * given through *refcount_table. All necessary allocations will be reflected
2265  * in that array.
2266  *
2267  * On success, the old refcount structure is leaked (it will be covered by the
2268  * new refcount structure).
2269  */
2270 static int rebuild_refcount_structure(BlockDriverState *bs,
2271                                       BdrvCheckResult *res,
2272                                       void **refcount_table,
2273                                       int64_t *nb_clusters)
2274 {
2275     BDRVQcow2State *s = bs->opaque;
2276     int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2277     int64_t refblock_offset, refblock_start, refblock_index;
2278     uint32_t reftable_size = 0;
2279     uint64_t *on_disk_reftable = NULL;
2280     void *on_disk_refblock;
2281     int ret = 0;
2282     struct {
2283         uint64_t reftable_offset;
2284         uint32_t reftable_clusters;
2285     } QEMU_PACKED reftable_offset_and_clusters;
2286 
2287     qcow2_cache_empty(bs, s->refcount_block_cache);
2288 
2289 write_refblocks:
2290     for (; cluster < *nb_clusters; cluster++) {
2291         if (!s->get_refcount(*refcount_table, cluster)) {
2292             continue;
2293         }
2294 
2295         refblock_index = cluster >> s->refcount_block_bits;
2296         refblock_start = refblock_index << s->refcount_block_bits;
2297 
2298         /* Don't allocate a cluster in a refblock already written to disk */
2299         if (first_free_cluster < refblock_start) {
2300             first_free_cluster = refblock_start;
2301         }
2302         refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2303                                               nb_clusters, &first_free_cluster);
2304         if (refblock_offset < 0) {
2305             fprintf(stderr, "ERROR allocating refblock: %s\n",
2306                     strerror(-refblock_offset));
2307             res->check_errors++;
2308             ret = refblock_offset;
2309             goto fail;
2310         }
2311 
2312         if (reftable_size <= refblock_index) {
2313             uint32_t old_reftable_size = reftable_size;
2314             uint64_t *new_on_disk_reftable;
2315 
2316             reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2317                                      s->cluster_size) / sizeof(uint64_t);
2318             new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2319                                                  reftable_size *
2320                                                  sizeof(uint64_t));
2321             if (!new_on_disk_reftable) {
2322                 res->check_errors++;
2323                 ret = -ENOMEM;
2324                 goto fail;
2325             }
2326             on_disk_reftable = new_on_disk_reftable;
2327 
2328             memset(on_disk_reftable + old_reftable_size, 0,
2329                    (reftable_size - old_reftable_size) * sizeof(uint64_t));
2330 
2331             /* The offset we have for the reftable is now no longer valid;
2332              * this will leak that range, but we can easily fix that by running
2333              * a leak-fixing check after this rebuild operation */
2334             reftable_offset = -1;
2335         } else {
2336             assert(on_disk_reftable);
2337         }
2338         on_disk_reftable[refblock_index] = refblock_offset;
2339 
2340         /* If this is apparently the last refblock (for now), try to squeeze the
2341          * reftable in */
2342         if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2343             reftable_offset < 0)
2344         {
2345             uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2346                                                           sizeof(uint64_t));
2347             reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2348                                                   refcount_table, nb_clusters,
2349                                                   &first_free_cluster);
2350             if (reftable_offset < 0) {
2351                 fprintf(stderr, "ERROR allocating reftable: %s\n",
2352                         strerror(-reftable_offset));
2353                 res->check_errors++;
2354                 ret = reftable_offset;
2355                 goto fail;
2356             }
2357         }
2358 
2359         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2360                                             s->cluster_size);
2361         if (ret < 0) {
2362             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2363             goto fail;
2364         }
2365 
2366         /* The size of *refcount_table is always cluster-aligned, therefore the
2367          * write operation will not overflow */
2368         on_disk_refblock = (void *)((char *) *refcount_table +
2369                                     refblock_index * s->cluster_size);
2370 
2371         ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
2372                          on_disk_refblock, s->cluster_sectors);
2373         if (ret < 0) {
2374             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2375             goto fail;
2376         }
2377 
2378         /* Go to the end of this refblock */
2379         cluster = refblock_start + s->refcount_block_size - 1;
2380     }
2381 
2382     if (reftable_offset < 0) {
2383         uint64_t post_refblock_start, reftable_clusters;
2384 
2385         post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2386         reftable_clusters = size_to_clusters(s,
2387                                              reftable_size * sizeof(uint64_t));
2388         /* Not pretty but simple */
2389         if (first_free_cluster < post_refblock_start) {
2390             first_free_cluster = post_refblock_start;
2391         }
2392         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2393                                               refcount_table, nb_clusters,
2394                                               &first_free_cluster);
2395         if (reftable_offset < 0) {
2396             fprintf(stderr, "ERROR allocating reftable: %s\n",
2397                     strerror(-reftable_offset));
2398             res->check_errors++;
2399             ret = reftable_offset;
2400             goto fail;
2401         }
2402 
2403         goto write_refblocks;
2404     }
2405 
2406     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2407         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2408     }
2409 
2410     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2411                                         reftable_size * sizeof(uint64_t));
2412     if (ret < 0) {
2413         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2414         goto fail;
2415     }
2416 
2417     assert(reftable_size < INT_MAX / sizeof(uint64_t));
2418     ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2419                       reftable_size * sizeof(uint64_t));
2420     if (ret < 0) {
2421         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2422         goto fail;
2423     }
2424 
2425     /* Enter new reftable into the image header */
2426     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2427     reftable_offset_and_clusters.reftable_clusters =
2428         cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2429     ret = bdrv_pwrite_sync(bs->file,
2430                            offsetof(QCowHeader, refcount_table_offset),
2431                            &reftable_offset_and_clusters,
2432                            sizeof(reftable_offset_and_clusters));
2433     if (ret < 0) {
2434         fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2435         goto fail;
2436     }
2437 
2438     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2439         be64_to_cpus(&on_disk_reftable[refblock_index]);
2440     }
2441     s->refcount_table = on_disk_reftable;
2442     s->refcount_table_offset = reftable_offset;
2443     s->refcount_table_size = reftable_size;
2444     update_max_refcount_table_index(s);
2445 
2446     return 0;
2447 
2448 fail:
2449     g_free(on_disk_reftable);
2450     return ret;
2451 }
2452 
2453 /*
2454  * Checks an image for refcount consistency.
2455  *
2456  * Returns 0 if no errors are found, the number of errors in case the image is
2457  * detected as corrupted, and -errno when an internal error occurred.
2458  */
2459 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2460                           BdrvCheckMode fix)
2461 {
2462     BDRVQcow2State *s = bs->opaque;
2463     BdrvCheckResult pre_compare_res;
2464     int64_t size, highest_cluster, nb_clusters;
2465     void *refcount_table = NULL;
2466     bool rebuild = false;
2467     int ret;
2468 
2469     size = bdrv_getlength(bs->file->bs);
2470     if (size < 0) {
2471         res->check_errors++;
2472         return size;
2473     }
2474 
2475     nb_clusters = size_to_clusters(s, size);
2476     if (nb_clusters > INT_MAX) {
2477         res->check_errors++;
2478         return -EFBIG;
2479     }
2480 
2481     res->bfi.total_clusters =
2482         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2483 
2484     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2485                               &nb_clusters);
2486     if (ret < 0) {
2487         goto fail;
2488     }
2489 
2490     /* In case we don't need to rebuild the refcount structure (but want to fix
2491      * something), this function is immediately called again, in which case the
2492      * result should be ignored */
2493     pre_compare_res = *res;
2494     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2495                       nb_clusters);
2496 
2497     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2498         BdrvCheckResult old_res = *res;
2499         int fresh_leaks = 0;
2500 
2501         fprintf(stderr, "Rebuilding refcount structure\n");
2502         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2503                                          &nb_clusters);
2504         if (ret < 0) {
2505             goto fail;
2506         }
2507 
2508         res->corruptions = 0;
2509         res->leaks = 0;
2510 
2511         /* Because the old reftable has been exchanged for a new one the
2512          * references have to be recalculated */
2513         rebuild = false;
2514         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2515         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2516                                   &nb_clusters);
2517         if (ret < 0) {
2518             goto fail;
2519         }
2520 
2521         if (fix & BDRV_FIX_LEAKS) {
2522             /* The old refcount structures are now leaked, fix it; the result
2523              * can be ignored, aside from leaks which were introduced by
2524              * rebuild_refcount_structure() that could not be fixed */
2525             BdrvCheckResult saved_res = *res;
2526             *res = (BdrvCheckResult){ 0 };
2527 
2528             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2529                               &highest_cluster, refcount_table, nb_clusters);
2530             if (rebuild) {
2531                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2532                         "broken\n");
2533             }
2534 
2535             /* Any leaks accounted for here were introduced by
2536              * rebuild_refcount_structure() because that function has created a
2537              * new refcount structure from scratch */
2538             fresh_leaks = res->leaks;
2539             *res = saved_res;
2540         }
2541 
2542         if (res->corruptions < old_res.corruptions) {
2543             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2544         }
2545         if (res->leaks < old_res.leaks) {
2546             res->leaks_fixed += old_res.leaks - res->leaks;
2547         }
2548         res->leaks += fresh_leaks;
2549     } else if (fix) {
2550         if (rebuild) {
2551             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2552             res->check_errors++;
2553             ret = -EIO;
2554             goto fail;
2555         }
2556 
2557         if (res->leaks || res->corruptions) {
2558             *res = pre_compare_res;
2559             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2560                               refcount_table, nb_clusters);
2561         }
2562     }
2563 
2564     /* check OFLAG_COPIED */
2565     ret = check_oflag_copied(bs, res, fix);
2566     if (ret < 0) {
2567         goto fail;
2568     }
2569 
2570     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2571     ret = 0;
2572 
2573 fail:
2574     g_free(refcount_table);
2575 
2576     return ret;
2577 }
2578 
2579 #define overlaps_with(ofs, sz) \
2580     ranges_overlap(offset, size, ofs, sz)
2581 
2582 /*
2583  * Checks if the given offset into the image file is actually free to use by
2584  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2585  * i.e. a sanity check without relying on the refcount tables.
2586  *
2587  * The ign parameter specifies what checks not to perform (being a bitmask of
2588  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2589  *
2590  * Returns:
2591  * - 0 if writing to this offset will not affect the mentioned metadata
2592  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2593  * - a negative value (-errno) indicating an error while performing a check,
2594  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2595  */
2596 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2597                                  int64_t size)
2598 {
2599     BDRVQcow2State *s = bs->opaque;
2600     int chk = s->overlap_check & ~ign;
2601     int i, j;
2602 
2603     if (!size) {
2604         return 0;
2605     }
2606 
2607     if (chk & QCOW2_OL_MAIN_HEADER) {
2608         if (offset < s->cluster_size) {
2609             return QCOW2_OL_MAIN_HEADER;
2610         }
2611     }
2612 
2613     /* align range to test to cluster boundaries */
2614     size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2615     offset = start_of_cluster(s, offset);
2616 
2617     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2618         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2619             return QCOW2_OL_ACTIVE_L1;
2620         }
2621     }
2622 
2623     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2624         if (overlaps_with(s->refcount_table_offset,
2625             s->refcount_table_size * sizeof(uint64_t))) {
2626             return QCOW2_OL_REFCOUNT_TABLE;
2627         }
2628     }
2629 
2630     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2631         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2632             return QCOW2_OL_SNAPSHOT_TABLE;
2633         }
2634     }
2635 
2636     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2637         for (i = 0; i < s->nb_snapshots; i++) {
2638             if (s->snapshots[i].l1_size &&
2639                 overlaps_with(s->snapshots[i].l1_table_offset,
2640                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2641                 return QCOW2_OL_INACTIVE_L1;
2642             }
2643         }
2644     }
2645 
2646     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2647         for (i = 0; i < s->l1_size; i++) {
2648             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2649                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2650                 s->cluster_size)) {
2651                 return QCOW2_OL_ACTIVE_L2;
2652             }
2653         }
2654     }
2655 
2656     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2657         unsigned last_entry = s->max_refcount_table_index;
2658         assert(last_entry < s->refcount_table_size);
2659         assert(last_entry + 1 == s->refcount_table_size ||
2660                (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2661         for (i = 0; i <= last_entry; i++) {
2662             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2663                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2664                 s->cluster_size)) {
2665                 return QCOW2_OL_REFCOUNT_BLOCK;
2666             }
2667         }
2668     }
2669 
2670     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2671         for (i = 0; i < s->nb_snapshots; i++) {
2672             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2673             uint32_t l1_sz  = s->snapshots[i].l1_size;
2674             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2675             uint64_t *l1;
2676             int ret;
2677 
2678             ret = qcow2_validate_table(bs, l1_ofs, l1_sz, sizeof(uint64_t),
2679                                        QCOW_MAX_L1_SIZE, "", NULL);
2680             if (ret < 0) {
2681                 return ret;
2682             }
2683 
2684             l1 = g_try_malloc(l1_sz2);
2685 
2686             if (l1_sz2 && l1 == NULL) {
2687                 return -ENOMEM;
2688             }
2689 
2690             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2691             if (ret < 0) {
2692                 g_free(l1);
2693                 return ret;
2694             }
2695 
2696             for (j = 0; j < l1_sz; j++) {
2697                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2698                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2699                     g_free(l1);
2700                     return QCOW2_OL_INACTIVE_L2;
2701                 }
2702             }
2703 
2704             g_free(l1);
2705         }
2706     }
2707 
2708     if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
2709         (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
2710     {
2711         if (overlaps_with(s->bitmap_directory_offset,
2712                           s->bitmap_directory_size))
2713         {
2714             return QCOW2_OL_BITMAP_DIRECTORY;
2715         }
2716     }
2717 
2718     return 0;
2719 }
2720 
2721 static const char *metadata_ol_names[] = {
2722     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2723     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2724     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2725     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2726     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2727     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2728     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2729     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2730 };
2731 
2732 /*
2733  * First performs a check for metadata overlaps (through
2734  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2735  * while performing a check), that value is returned. If an impending overlap
2736  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2737  * and -EIO returned.
2738  *
2739  * Returns 0 if there were neither overlaps nor errors while checking for
2740  * overlaps; or a negative value (-errno) on error.
2741  */
2742 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2743                                   int64_t size)
2744 {
2745     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2746 
2747     if (ret < 0) {
2748         return ret;
2749     } else if (ret > 0) {
2750         int metadata_ol_bitnr = ctz32(ret);
2751         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2752 
2753         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2754                                 "write on metadata (overlaps with %s)",
2755                                 metadata_ol_names[metadata_ol_bitnr]);
2756         return -EIO;
2757     }
2758 
2759     return 0;
2760 }
2761 
2762 /* A pointer to a function of this type is given to walk_over_reftable(). That
2763  * function will create refblocks and pass them to a RefblockFinishOp once they
2764  * are completed (@refblock). @refblock_empty is set if the refblock is
2765  * completely empty.
2766  *
2767  * Along with the refblock, a corresponding reftable entry is passed, in the
2768  * reftable @reftable (which may be reallocated) at @reftable_index.
2769  *
2770  * @allocated should be set to true if a new cluster has been allocated.
2771  */
2772 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2773                                uint64_t reftable_index, uint64_t *reftable_size,
2774                                void *refblock, bool refblock_empty,
2775                                bool *allocated, Error **errp);
2776 
2777 /**
2778  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2779  * it is not empty) and inserts its offset into the new reftable. The size of
2780  * this new reftable is increased as required.
2781  */
2782 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2783                           uint64_t reftable_index, uint64_t *reftable_size,
2784                           void *refblock, bool refblock_empty, bool *allocated,
2785                           Error **errp)
2786 {
2787     BDRVQcow2State *s = bs->opaque;
2788     int64_t offset;
2789 
2790     if (!refblock_empty && reftable_index >= *reftable_size) {
2791         uint64_t *new_reftable;
2792         uint64_t new_reftable_size;
2793 
2794         new_reftable_size = ROUND_UP(reftable_index + 1,
2795                                      s->cluster_size / sizeof(uint64_t));
2796         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2797             error_setg(errp,
2798                        "This operation would make the refcount table grow "
2799                        "beyond the maximum size supported by QEMU, aborting");
2800             return -ENOTSUP;
2801         }
2802 
2803         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2804                                                 sizeof(uint64_t));
2805         if (!new_reftable) {
2806             error_setg(errp, "Failed to increase reftable buffer size");
2807             return -ENOMEM;
2808         }
2809 
2810         memset(new_reftable + *reftable_size, 0,
2811                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2812 
2813         *reftable      = new_reftable;
2814         *reftable_size = new_reftable_size;
2815     }
2816 
2817     if (!refblock_empty && !(*reftable)[reftable_index]) {
2818         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2819         if (offset < 0) {
2820             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2821             return offset;
2822         }
2823         (*reftable)[reftable_index] = offset;
2824         *allocated = true;
2825     }
2826 
2827     return 0;
2828 }
2829 
2830 /**
2831  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2832  * offset specified by the new reftable's entry. It does not modify the new
2833  * reftable or change any refcounts.
2834  */
2835 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2836                           uint64_t reftable_index, uint64_t *reftable_size,
2837                           void *refblock, bool refblock_empty, bool *allocated,
2838                           Error **errp)
2839 {
2840     BDRVQcow2State *s = bs->opaque;
2841     int64_t offset;
2842     int ret;
2843 
2844     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2845         offset = (*reftable)[reftable_index];
2846 
2847         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2848         if (ret < 0) {
2849             error_setg_errno(errp, -ret, "Overlap check failed");
2850             return ret;
2851         }
2852 
2853         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2854         if (ret < 0) {
2855             error_setg_errno(errp, -ret, "Failed to write refblock");
2856             return ret;
2857         }
2858     } else {
2859         assert(refblock_empty);
2860     }
2861 
2862     return 0;
2863 }
2864 
2865 /**
2866  * This function walks over the existing reftable and every referenced refblock;
2867  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2868  * create an equal new entry in the passed @new_refblock. Once that
2869  * @new_refblock is completely filled, @operation will be called.
2870  *
2871  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2872  * @index is the index of the walk_over_reftable() calls and @total is the total
2873  * number of walk_over_reftable() calls per amend operation. Both are used for
2874  * calculating the parameters for the status callback.
2875  *
2876  * @allocated is set to true if a new cluster has been allocated.
2877  */
2878 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2879                               uint64_t *new_reftable_index,
2880                               uint64_t *new_reftable_size,
2881                               void *new_refblock, int new_refblock_size,
2882                               int new_refcount_bits,
2883                               RefblockFinishOp *operation, bool *allocated,
2884                               Qcow2SetRefcountFunc *new_set_refcount,
2885                               BlockDriverAmendStatusCB *status_cb,
2886                               void *cb_opaque, int index, int total,
2887                               Error **errp)
2888 {
2889     BDRVQcow2State *s = bs->opaque;
2890     uint64_t reftable_index;
2891     bool new_refblock_empty = true;
2892     int refblock_index;
2893     int new_refblock_index = 0;
2894     int ret;
2895 
2896     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2897          reftable_index++)
2898     {
2899         uint64_t refblock_offset = s->refcount_table[reftable_index]
2900                                  & REFT_OFFSET_MASK;
2901 
2902         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2903                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2904 
2905         if (refblock_offset) {
2906             void *refblock;
2907 
2908             if (offset_into_cluster(s, refblock_offset)) {
2909                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2910                                         PRIx64 " unaligned (reftable index: %#"
2911                                         PRIx64 ")", refblock_offset,
2912                                         reftable_index);
2913                 error_setg(errp,
2914                            "Image is corrupt (unaligned refblock offset)");
2915                 return -EIO;
2916             }
2917 
2918             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2919                                   &refblock);
2920             if (ret < 0) {
2921                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2922                 return ret;
2923             }
2924 
2925             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2926                  refblock_index++)
2927             {
2928                 uint64_t refcount;
2929 
2930                 if (new_refblock_index >= new_refblock_size) {
2931                     /* new_refblock is now complete */
2932                     ret = operation(bs, new_reftable, *new_reftable_index,
2933                                     new_reftable_size, new_refblock,
2934                                     new_refblock_empty, allocated, errp);
2935                     if (ret < 0) {
2936                         qcow2_cache_put(s->refcount_block_cache, &refblock);
2937                         return ret;
2938                     }
2939 
2940                     (*new_reftable_index)++;
2941                     new_refblock_index = 0;
2942                     new_refblock_empty = true;
2943                 }
2944 
2945                 refcount = s->get_refcount(refblock, refblock_index);
2946                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2947                     uint64_t offset;
2948 
2949                     qcow2_cache_put(s->refcount_block_cache, &refblock);
2950 
2951                     offset = ((reftable_index << s->refcount_block_bits)
2952                               + refblock_index) << s->cluster_bits;
2953 
2954                     error_setg(errp, "Cannot decrease refcount entry width to "
2955                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2956                                "refcount of %" PRIu64, new_refcount_bits,
2957                                offset, refcount);
2958                     return -EINVAL;
2959                 }
2960 
2961                 if (new_set_refcount) {
2962                     new_set_refcount(new_refblock, new_refblock_index++,
2963                                      refcount);
2964                 } else {
2965                     new_refblock_index++;
2966                 }
2967                 new_refblock_empty = new_refblock_empty && refcount == 0;
2968             }
2969 
2970             qcow2_cache_put(s->refcount_block_cache, &refblock);
2971         } else {
2972             /* No refblock means every refcount is 0 */
2973             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2974                  refblock_index++)
2975             {
2976                 if (new_refblock_index >= new_refblock_size) {
2977                     /* new_refblock is now complete */
2978                     ret = operation(bs, new_reftable, *new_reftable_index,
2979                                     new_reftable_size, new_refblock,
2980                                     new_refblock_empty, allocated, errp);
2981                     if (ret < 0) {
2982                         return ret;
2983                     }
2984 
2985                     (*new_reftable_index)++;
2986                     new_refblock_index = 0;
2987                     new_refblock_empty = true;
2988                 }
2989 
2990                 if (new_set_refcount) {
2991                     new_set_refcount(new_refblock, new_refblock_index++, 0);
2992                 } else {
2993                     new_refblock_index++;
2994                 }
2995             }
2996         }
2997     }
2998 
2999     if (new_refblock_index > 0) {
3000         /* Complete the potentially existing partially filled final refblock */
3001         if (new_set_refcount) {
3002             for (; new_refblock_index < new_refblock_size;
3003                  new_refblock_index++)
3004             {
3005                 new_set_refcount(new_refblock, new_refblock_index, 0);
3006             }
3007         }
3008 
3009         ret = operation(bs, new_reftable, *new_reftable_index,
3010                         new_reftable_size, new_refblock, new_refblock_empty,
3011                         allocated, errp);
3012         if (ret < 0) {
3013             return ret;
3014         }
3015 
3016         (*new_reftable_index)++;
3017     }
3018 
3019     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3020               (uint64_t)total * s->refcount_table_size, cb_opaque);
3021 
3022     return 0;
3023 }
3024 
3025 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3026                                 BlockDriverAmendStatusCB *status_cb,
3027                                 void *cb_opaque, Error **errp)
3028 {
3029     BDRVQcow2State *s = bs->opaque;
3030     Qcow2GetRefcountFunc *new_get_refcount;
3031     Qcow2SetRefcountFunc *new_set_refcount;
3032     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3033     uint64_t *new_reftable = NULL, new_reftable_size = 0;
3034     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3035     uint64_t new_reftable_index = 0;
3036     uint64_t i;
3037     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3038     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3039     int old_refcount_order;
3040     int walk_index = 0;
3041     int ret;
3042     bool new_allocation;
3043 
3044     assert(s->qcow_version >= 3);
3045     assert(refcount_order >= 0 && refcount_order <= 6);
3046 
3047     /* see qcow2_open() */
3048     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3049 
3050     new_get_refcount = get_refcount_funcs[refcount_order];
3051     new_set_refcount = set_refcount_funcs[refcount_order];
3052 
3053 
3054     do {
3055         int total_walks;
3056 
3057         new_allocation = false;
3058 
3059         /* At least we have to do this walk and the one which writes the
3060          * refblocks; also, at least we have to do this loop here at least
3061          * twice (normally), first to do the allocations, and second to
3062          * determine that everything is correctly allocated, this then makes
3063          * three walks in total */
3064         total_walks = MAX(walk_index + 2, 3);
3065 
3066         /* First, allocate the structures so they are present in the refcount
3067          * structures */
3068         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3069                                  &new_reftable_size, NULL, new_refblock_size,
3070                                  new_refcount_bits, &alloc_refblock,
3071                                  &new_allocation, NULL, status_cb, cb_opaque,
3072                                  walk_index++, total_walks, errp);
3073         if (ret < 0) {
3074             goto done;
3075         }
3076 
3077         new_reftable_index = 0;
3078 
3079         if (new_allocation) {
3080             if (new_reftable_offset) {
3081                 qcow2_free_clusters(bs, new_reftable_offset,
3082                                     allocated_reftable_size * sizeof(uint64_t),
3083                                     QCOW2_DISCARD_NEVER);
3084             }
3085 
3086             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3087                                                            sizeof(uint64_t));
3088             if (new_reftable_offset < 0) {
3089                 error_setg_errno(errp, -new_reftable_offset,
3090                                  "Failed to allocate the new reftable");
3091                 ret = new_reftable_offset;
3092                 goto done;
3093             }
3094             allocated_reftable_size = new_reftable_size;
3095         }
3096     } while (new_allocation);
3097 
3098     /* Second, write the new refblocks */
3099     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3100                              &new_reftable_size, new_refblock,
3101                              new_refblock_size, new_refcount_bits,
3102                              &flush_refblock, &new_allocation, new_set_refcount,
3103                              status_cb, cb_opaque, walk_index, walk_index + 1,
3104                              errp);
3105     if (ret < 0) {
3106         goto done;
3107     }
3108     assert(!new_allocation);
3109 
3110 
3111     /* Write the new reftable */
3112     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3113                                         new_reftable_size * sizeof(uint64_t));
3114     if (ret < 0) {
3115         error_setg_errno(errp, -ret, "Overlap check failed");
3116         goto done;
3117     }
3118 
3119     for (i = 0; i < new_reftable_size; i++) {
3120         cpu_to_be64s(&new_reftable[i]);
3121     }
3122 
3123     ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
3124                       new_reftable_size * sizeof(uint64_t));
3125 
3126     for (i = 0; i < new_reftable_size; i++) {
3127         be64_to_cpus(&new_reftable[i]);
3128     }
3129 
3130     if (ret < 0) {
3131         error_setg_errno(errp, -ret, "Failed to write the new reftable");
3132         goto done;
3133     }
3134 
3135 
3136     /* Empty the refcount cache */
3137     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3138     if (ret < 0) {
3139         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3140         goto done;
3141     }
3142 
3143     /* Update the image header to point to the new reftable; this only updates
3144      * the fields which are relevant to qcow2_update_header(); other fields
3145      * such as s->refcount_table or s->refcount_bits stay stale for now
3146      * (because we have to restore everything if qcow2_update_header() fails) */
3147     old_refcount_order  = s->refcount_order;
3148     old_reftable_size   = s->refcount_table_size;
3149     old_reftable_offset = s->refcount_table_offset;
3150 
3151     s->refcount_order        = refcount_order;
3152     s->refcount_table_size   = new_reftable_size;
3153     s->refcount_table_offset = new_reftable_offset;
3154 
3155     ret = qcow2_update_header(bs);
3156     if (ret < 0) {
3157         s->refcount_order        = old_refcount_order;
3158         s->refcount_table_size   = old_reftable_size;
3159         s->refcount_table_offset = old_reftable_offset;
3160         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3161         goto done;
3162     }
3163 
3164     /* Now update the rest of the in-memory information */
3165     old_reftable = s->refcount_table;
3166     s->refcount_table = new_reftable;
3167     update_max_refcount_table_index(s);
3168 
3169     s->refcount_bits = 1 << refcount_order;
3170     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3171     s->refcount_max += s->refcount_max - 1;
3172 
3173     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3174     s->refcount_block_size = 1 << s->refcount_block_bits;
3175 
3176     s->get_refcount = new_get_refcount;
3177     s->set_refcount = new_set_refcount;
3178 
3179     /* For cleaning up all old refblocks and the old reftable below the "done"
3180      * label */
3181     new_reftable        = old_reftable;
3182     new_reftable_size   = old_reftable_size;
3183     new_reftable_offset = old_reftable_offset;
3184 
3185 done:
3186     if (new_reftable) {
3187         /* On success, new_reftable actually points to the old reftable (and
3188          * new_reftable_size is the old reftable's size); but that is just
3189          * fine */
3190         for (i = 0; i < new_reftable_size; i++) {
3191             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3192             if (offset) {
3193                 qcow2_free_clusters(bs, offset, s->cluster_size,
3194                                     QCOW2_DISCARD_OTHER);
3195             }
3196         }
3197         g_free(new_reftable);
3198 
3199         if (new_reftable_offset > 0) {
3200             qcow2_free_clusters(bs, new_reftable_offset,
3201                                 new_reftable_size * sizeof(uint64_t),
3202                                 QCOW2_DISCARD_OTHER);
3203         }
3204     }
3205 
3206     qemu_vfree(new_refblock);
3207     return ret;
3208 }
3209 
3210 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3211 {
3212     BDRVQcow2State *s = bs->opaque;
3213     uint32_t index = offset_to_reftable_index(s, offset);
3214     int64_t covering_refblock_offset = 0;
3215 
3216     if (index < s->refcount_table_size) {
3217         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3218     }
3219     if (!covering_refblock_offset) {
3220         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3221                                 "not covered by the refcount structures",
3222                                 offset);
3223         return -EIO;
3224     }
3225 
3226     return covering_refblock_offset;
3227 }
3228 
3229 static int qcow2_discard_refcount_block(BlockDriverState *bs,
3230                                         uint64_t discard_block_offs)
3231 {
3232     BDRVQcow2State *s = bs->opaque;
3233     int64_t refblock_offs;
3234     uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3235     uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3236     void *refblock;
3237     int ret;
3238 
3239     refblock_offs = get_refblock_offset(bs, discard_block_offs);
3240     if (refblock_offs < 0) {
3241         return refblock_offs;
3242     }
3243 
3244     assert(discard_block_offs != 0);
3245 
3246     ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3247                           &refblock);
3248     if (ret < 0) {
3249         return ret;
3250     }
3251 
3252     if (s->get_refcount(refblock, block_index) != 1) {
3253         qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3254                                 " refblock offset %#" PRIx64
3255                                 ", reftable index %u"
3256                                 ", block offset %#" PRIx64
3257                                 ", refcount %#" PRIx64,
3258                                 refblock_offs,
3259                                 offset_to_reftable_index(s, discard_block_offs),
3260                                 discard_block_offs,
3261                                 s->get_refcount(refblock, block_index));
3262         qcow2_cache_put(s->refcount_block_cache, &refblock);
3263         return -EINVAL;
3264     }
3265     s->set_refcount(refblock, block_index, 0);
3266 
3267     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3268 
3269     qcow2_cache_put(s->refcount_block_cache, &refblock);
3270 
3271     if (cluster_index < s->free_cluster_index) {
3272         s->free_cluster_index = cluster_index;
3273     }
3274 
3275     refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3276                                            discard_block_offs);
3277     if (refblock) {
3278         /* discard refblock from the cache if refblock is cached */
3279         qcow2_cache_discard(s->refcount_block_cache, refblock);
3280     }
3281     update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3282 
3283     return 0;
3284 }
3285 
3286 int qcow2_shrink_reftable(BlockDriverState *bs)
3287 {
3288     BDRVQcow2State *s = bs->opaque;
3289     uint64_t *reftable_tmp =
3290         g_malloc(s->refcount_table_size * sizeof(uint64_t));
3291     int i, ret;
3292 
3293     for (i = 0; i < s->refcount_table_size; i++) {
3294         int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3295         void *refblock;
3296         bool unused_block;
3297 
3298         if (refblock_offs == 0) {
3299             reftable_tmp[i] = 0;
3300             continue;
3301         }
3302         ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3303                               &refblock);
3304         if (ret < 0) {
3305             goto out;
3306         }
3307 
3308         /* the refblock has own reference */
3309         if (i == offset_to_reftable_index(s, refblock_offs)) {
3310             uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3311                                    (s->refcount_block_size - 1);
3312             uint64_t refcount = s->get_refcount(refblock, block_index);
3313 
3314             s->set_refcount(refblock, block_index, 0);
3315 
3316             unused_block = buffer_is_zero(refblock, s->cluster_size);
3317 
3318             s->set_refcount(refblock, block_index, refcount);
3319         } else {
3320             unused_block = buffer_is_zero(refblock, s->cluster_size);
3321         }
3322         qcow2_cache_put(s->refcount_block_cache, &refblock);
3323 
3324         reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3325     }
3326 
3327     ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
3328                            s->refcount_table_size * sizeof(uint64_t));
3329     /*
3330      * If the write in the reftable failed the image may contain a partially
3331      * overwritten reftable. In this case it would be better to clear the
3332      * reftable in memory to avoid possible image corruption.
3333      */
3334     for (i = 0; i < s->refcount_table_size; i++) {
3335         if (s->refcount_table[i] && !reftable_tmp[i]) {
3336             if (ret == 0) {
3337                 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3338                                                        REFT_OFFSET_MASK);
3339             }
3340             s->refcount_table[i] = 0;
3341         }
3342     }
3343 
3344     if (!s->cache_discards) {
3345         qcow2_process_discards(bs, ret);
3346     }
3347 
3348 out:
3349     g_free(reftable_tmp);
3350     return ret;
3351 }
3352 
3353 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3354 {
3355     BDRVQcow2State *s = bs->opaque;
3356     int64_t i;
3357 
3358     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3359         uint64_t refcount;
3360         int ret = qcow2_get_refcount(bs, i, &refcount);
3361         if (ret < 0) {
3362             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3363                     i, strerror(-ret));
3364             return ret;
3365         }
3366         if (refcount > 0) {
3367             return i;
3368         }
3369     }
3370     qcow2_signal_corruption(bs, true, -1, -1,
3371                             "There are no references in the refcount table.");
3372     return -EIO;
3373 }
3374