xref: /openbmc/qemu/block/qcow2-refcount.c (revision 9cdd2a73)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "block/qcow2.h"
30 #include "qemu/range.h"
31 #include "qemu/bswap.h"
32 #include "qemu/cutils.h"
33 
/* Allocates clusters without touching their refcounts; used by the refblock
 * allocator below to avoid recursing into itself. */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);

/* Accessors for the seven supported refcount widths: *_roN handles
 * (1 << N)-bit wide refcount entries (ro0 = 1 bit ... ro6 = 64 bits). */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);
61 
62 
/* Reader dispatch table, indexed by s->refcount_order (0..6); see
 * qcow2_refcount_init() where the matching entry is installed. */
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};
72 
/* Writer dispatch table, indexed by s->refcount_order (0..6); must stay in
 * sync with get_refcount_funcs above. */
static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
82 
83 
84 /*********************************************************/
85 /* refcount handling */
86 
87 static void update_max_refcount_table_index(BDRVQcow2State *s)
88 {
89     unsigned i = s->refcount_table_size - 1;
90     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
91         i--;
92     }
93     /* Set s->max_refcount_table_index to the index of the last used entry */
94     s->max_refcount_table_index = i;
95 }
96 
97 int qcow2_refcount_init(BlockDriverState *bs)
98 {
99     BDRVQcow2State *s = bs->opaque;
100     unsigned int refcount_table_size2, i;
101     int ret;
102 
103     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
104 
105     s->get_refcount = get_refcount_funcs[s->refcount_order];
106     s->set_refcount = set_refcount_funcs[s->refcount_order];
107 
108     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
109     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
110     s->refcount_table = g_try_malloc(refcount_table_size2);
111 
112     if (s->refcount_table_size > 0) {
113         if (s->refcount_table == NULL) {
114             ret = -ENOMEM;
115             goto fail;
116         }
117         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
118         ret = bdrv_pread(bs->file, s->refcount_table_offset,
119                          s->refcount_table, refcount_table_size2);
120         if (ret < 0) {
121             goto fail;
122         }
123         for(i = 0; i < s->refcount_table_size; i++)
124             be64_to_cpus(&s->refcount_table[i]);
125         update_max_refcount_table_index(s);
126     }
127     return 0;
128  fail:
129     return ret;
130 }
131 
132 void qcow2_refcount_close(BlockDriverState *bs)
133 {
134     BDRVQcow2State *s = bs->opaque;
135     g_free(s->refcount_table);
136 }
137 
138 
/* Read a 1-bit refcount; eight entries are packed into each byte. */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;
    unsigned shift = index % 8;

    return (array[index / 8] >> shift) & 0x1;
}
143 
/* Write a 1-bit refcount; @value must be 0 or 1. */
static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned shift = index % 8;

    assert(!(value >> 1));
    array[index / 8] = (array[index / 8] & ~(0x1 << shift)) |
                       (value << shift);
}
151 
/* Read a 2-bit refcount; four entries are packed into each byte. */
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;
    unsigned shift = 2 * (index % 4);

    return (array[index / 4] >> shift) & 0x3;
}
157 
/* Write a 2-bit refcount; @value must fit in 2 bits. */
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned shift = 2 * (index % 4);

    assert(!(value >> 2));
    array[index / 4] = (array[index / 4] & ~(0x3 << shift)) |
                       (value << shift);
}
165 
/* Read a 4-bit refcount; two entries are packed into each byte. */
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;
    unsigned shift = 4 * (index % 2);

    return (array[index / 2] >> shift) & 0xf;
}
171 
/* Write a 4-bit refcount; @value must fit in 4 bits. */
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned shift = 4 * (index % 2);

    assert(!(value >> 4));
    array[index / 2] = (array[index / 2] & ~(0xf << shift)) |
                       (value << shift);
}
179 
/* Read an 8-bit refcount: one entry per byte, no unpacking needed. */
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;

    return array[index];
}
184 
/* Write an 8-bit refcount; @value must fit in one byte. */
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;

    assert(!(value >> 8));
    array[index] = value;
}
191 
/* Read a 16-bit refcount; entries are stored big-endian on disk. */
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    const uint16_t *array = refcount_array;

    return be16_to_cpu(array[index]);
}
196 
/* Write a 16-bit refcount (big-endian); @value must fit in 16 bits. */
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint16_t *array = refcount_array;

    assert(!(value >> 16));
    array[index] = cpu_to_be16(value);
}
203 
/* Read a 32-bit refcount; entries are stored big-endian on disk. */
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    const uint32_t *array = refcount_array;

    return be32_to_cpu(array[index]);
}
208 
/* Write a 32-bit refcount (big-endian); @value must fit in 32 bits. */
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint32_t *array = refcount_array;

    assert(!(value >> 32));
    array[index] = cpu_to_be32(value);
}
215 
/* Read a 64-bit refcount; entries are stored big-endian on disk. */
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    const uint64_t *array = refcount_array;

    return be64_to_cpu(array[index]);
}
220 
/* Write a 64-bit refcount (big-endian); any uint64_t value fits. */
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint64_t *array = refcount_array;

    array[index] = cpu_to_be64(value);
}
226 
227 
/*
 * Fetches the refcount block at @refcount_block_offset through the refcount
 * block cache and returns it in *refcount_block.
 *
 * Returns 0 on success, -errno on failure (propagated from qcow2_cache_get).
 * On success the callers in this file release the block again with
 * qcow2_cache_put().
 */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}
238 
/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 *
 * A cluster beyond the end of the reftable, or whose reftable entry has no
 * refblock allocated, reports a refcount of 0 (this is success, not error).
 * An unaligned refblock offset is treated as image corruption (-EIO).
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        /* Empty reftable entry: no refblock, hence refcount 0 */
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* Entry index within the refblock, then read via the width-specific
     * accessor installed in qcow2_refcount_init() */
    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}
284 
285 /* Checks if two offsets are described by the same refcount block */
286 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
287     uint64_t offset_b)
288 {
289     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
290     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
291 
292     return (block_a == block_b);
293 }
294 
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case.
 *
 * May also return -EAGAIN after successfully allocating new refcount
 * metadata: the new metadata may occupy clusters the caller intended to use,
 * so the caller must restart its search for free clusters (see the comments
 * at the two -EAGAIN return points below).
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

             return load_refcount_block(bs, refcount_block_offset,
                                        refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    /* NULLed so the fail: path below knows whether a cache reference is held */
    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
508 
/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    /* How many refblocks are needed to cover the image up to
     * start_offset + additional_clusters (plus headroom if !exact_size) */
    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    /* Layout: [new refblocks][new reftable]; both start at start_offset */
    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk (entries are big-endian on disk) */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    /* Convert back to CPU endianness, new_table stays in use in memory */
    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
726 
727 void qcow2_process_discards(BlockDriverState *bs, int ret)
728 {
729     BDRVQcow2State *s = bs->opaque;
730     Qcow2DiscardRegion *d, *next;
731 
732     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
733         QTAILQ_REMOVE(&s->discards, d, next);
734 
735         /* Discard is optional, ignore the return value */
736         if (ret >= 0) {
737             bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
738         }
739 
740         g_free(d);
741     }
742 }
743 
/*
 * Queues the range [offset, offset + length) for discarding; the queue is
 * drained later by qcow2_process_discards().  If the range is adjacent to an
 * already queued region, that region is extended in place; afterwards any
 * regions that have become adjacent are merged so the queue stays minimal.
 */
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        /* True iff the new range touches (or overlaps) region d */
        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* No adjacent region found: queue a fresh one */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
793 
794 /* XXX: cache several refcount block clusters ? */
795 /* @addend is the absolute value of the addend; if @decrease is set, @addend
796  * will be subtracted from the current refcount, otherwise it will be added */
797 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
798                                                    int64_t offset,
799                                                    int64_t length,
800                                                    uint64_t addend,
801                                                    bool decrease,
802                                                    enum qcow2_discard_type type)
803 {
804     BDRVQcow2State *s = bs->opaque;
805     int64_t start, last, cluster_offset;
806     void *refcount_block = NULL;
807     int64_t old_table_index = -1;
808     int ret;
809 
810 #ifdef DEBUG_ALLOC2
811     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
812             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
813             addend);
814 #endif
815     if (length < 0) {
816         return -EINVAL;
817     } else if (length == 0) {
818         return 0;
819     }
820 
821     if (decrease) {
822         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
823             s->l2_table_cache);
824     }
825 
826     start = start_of_cluster(s, offset);
827     last = start_of_cluster(s, offset + length - 1);
828     for(cluster_offset = start; cluster_offset <= last;
829         cluster_offset += s->cluster_size)
830     {
831         int block_index;
832         uint64_t refcount;
833         int64_t cluster_index = cluster_offset >> s->cluster_bits;
834         int64_t table_index = cluster_index >> s->refcount_block_bits;
835 
836         /* Load the refcount block and allocate it if needed */
837         if (table_index != old_table_index) {
838             if (refcount_block) {
839                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
840             }
841             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
842             if (ret < 0) {
843                 goto fail;
844             }
845         }
846         old_table_index = table_index;
847 
848         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
849 
850         /* we can update the count and save it */
851         block_index = cluster_index & (s->refcount_block_size - 1);
852 
853         refcount = s->get_refcount(refcount_block, block_index);
854         if (decrease ? (refcount - addend > refcount)
855                      : (refcount + addend < refcount ||
856                         refcount + addend > s->refcount_max))
857         {
858             ret = -EINVAL;
859             goto fail;
860         }
861         if (decrease) {
862             refcount -= addend;
863         } else {
864             refcount += addend;
865         }
866         if (refcount == 0 && cluster_index < s->free_cluster_index) {
867             s->free_cluster_index = cluster_index;
868         }
869         s->set_refcount(refcount_block, block_index, refcount);
870 
871         if (refcount == 0) {
872             void *table;
873 
874             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
875                                                 offset);
876             if (table != NULL) {
877                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
878                 qcow2_cache_discard(s->refcount_block_cache, table);
879             }
880 
881             table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
882             if (table != NULL) {
883                 qcow2_cache_discard(s->l2_table_cache, table);
884             }
885 
886             if (s->discard_passthrough[type]) {
887                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
888             }
889         }
890     }
891 
892     ret = 0;
893 fail:
894     if (!s->cache_discards) {
895         qcow2_process_discards(bs, ret);
896     }
897 
898     /* Write last changed block to disk */
899     if (refcount_block) {
900         qcow2_cache_put(s->refcount_block_cache, &refcount_block);
901     }
902 
903     /*
904      * Try do undo any updates if an error is returned (This may succeed in
905      * some cases like ENOSPC for allocating a new refcount block)
906      */
907     if (ret < 0) {
908         int dummy;
909         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
910                                 !decrease, QCOW2_DISCARD_NEVER);
911         (void)dummy;
912     }
913 
914     return ret;
915 }
916 
917 /*
918  * Increases or decreases the refcount of a given cluster.
919  *
920  * @addend is the absolute value of the addend; if @decrease is set, @addend
921  * will be subtracted from the current refcount, otherwise it will be added.
922  *
923  * On success 0 is returned; on failure -errno is returned.
924  */
925 int qcow2_update_cluster_refcount(BlockDriverState *bs,
926                                   int64_t cluster_index,
927                                   uint64_t addend, bool decrease,
928                                   enum qcow2_discard_type type)
929 {
930     BDRVQcow2State *s = bs->opaque;
931     int ret;
932 
933     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
934                           decrease, type);
935     if (ret < 0) {
936         return ret;
937     }
938 
939     return 0;
940 }
941 
942 
943 
944 /*********************************************************/
945 /* cluster allocation functions */
946 
947 
948 
949 /* return < 0 if error */
950 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
951 {
952     BDRVQcow2State *s = bs->opaque;
953     uint64_t i, nb_clusters, refcount;
954     int ret;
955 
956     /* We can't allocate clusters if they may still be queued for discard. */
957     if (s->cache_discards) {
958         qcow2_process_discards(bs, 0);
959     }
960 
961     nb_clusters = size_to_clusters(s, size);
962 retry:
963     for(i = 0; i < nb_clusters; i++) {
964         uint64_t next_cluster_index = s->free_cluster_index++;
965         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
966 
967         if (ret < 0) {
968             return ret;
969         } else if (refcount != 0) {
970             goto retry;
971         }
972     }
973 
974     /* Make sure that all offsets in the "allocated" range are representable
975      * in an int64_t */
976     if (s->free_cluster_index > 0 &&
977         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
978     {
979         return -EFBIG;
980     }
981 
982 #ifdef DEBUG_ALLOC2
983     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
984             size,
985             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
986 #endif
987     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
988 }
989 
990 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
991 {
992     int64_t offset;
993     int ret;
994 
995     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
996     do {
997         offset = alloc_clusters_noref(bs, size);
998         if (offset < 0) {
999             return offset;
1000         }
1001 
1002         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1003     } while (ret == -EAGAIN);
1004 
1005     if (ret < 0) {
1006         return ret;
1007     }
1008 
1009     return offset;
1010 }
1011 
1012 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1013                                 int64_t nb_clusters)
1014 {
1015     BDRVQcow2State *s = bs->opaque;
1016     uint64_t cluster_index, refcount;
1017     uint64_t i;
1018     int ret;
1019 
1020     assert(nb_clusters >= 0);
1021     if (nb_clusters == 0) {
1022         return 0;
1023     }
1024 
1025     do {
1026         /* Check how many clusters there are free */
1027         cluster_index = offset >> s->cluster_bits;
1028         for(i = 0; i < nb_clusters; i++) {
1029             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1030             if (ret < 0) {
1031                 return ret;
1032             } else if (refcount != 0) {
1033                 break;
1034             }
1035         }
1036 
1037         /* And then allocate them */
1038         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1039                               QCOW2_DISCARD_NEVER);
1040     } while (ret == -EAGAIN);
1041 
1042     if (ret < 0) {
1043         return ret;
1044     }
1045 
1046     return i;
1047 }
1048 
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    /* free_byte_offset caches the tail of the last partially-filled cluster;
     * 0 means "no such cluster". It is never cluster aligned, because a
     * fully used cluster resets it to 0 (see the end of this function). */
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        /* Each allocation from this cluster bumps its refcount (below); once
         * the refcount is saturated we must start a fresh cluster. */
        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        /* Grab a new cluster when there is no usable tail or the request
         * does not fit in what remains of the current one */
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            /* If the new cluster directly follows the old tail, the
             * allocation can span both; otherwise restart at the new one. */
            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            /* Force a fresh cluster on the -EAGAIN retry */
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    /* Remember the unused tail for the next allocation; a fully consumed
     * cluster (aligned end) leaves nothing to reuse. */
    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
1120 
1121 void qcow2_free_clusters(BlockDriverState *bs,
1122                           int64_t offset, int64_t size,
1123                           enum qcow2_discard_type type)
1124 {
1125     int ret;
1126 
1127     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1128     ret = update_refcount(bs, offset, size, 1, true, type);
1129     if (ret < 0) {
1130         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1131         /* TODO Remember the clusters to free them later and avoid leaking */
1132     }
1133 }
1134 
1135 /*
1136  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1137  * normal cluster, compressed cluster, etc.)
1138  */
1139 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1140                              int nb_clusters, enum qcow2_discard_type type)
1141 {
1142     BDRVQcow2State *s = bs->opaque;
1143 
1144     switch (qcow2_get_cluster_type(l2_entry)) {
1145     case QCOW2_CLUSTER_COMPRESSED:
1146         {
1147             int nb_csectors;
1148             nb_csectors = ((l2_entry >> s->csize_shift) &
1149                            s->csize_mask) + 1;
1150             qcow2_free_clusters(bs,
1151                 (l2_entry & s->cluster_offset_mask) & ~511,
1152                 nb_csectors * 512, type);
1153         }
1154         break;
1155     case QCOW2_CLUSTER_NORMAL:
1156     case QCOW2_CLUSTER_ZERO_ALLOC:
1157         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1158             qcow2_signal_corruption(bs, false, -1, -1,
1159                                     "Cannot free unaligned cluster %#llx",
1160                                     l2_entry & L2E_OFFSET_MASK);
1161         } else {
1162             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1163                                 nb_clusters << s->cluster_bits, type);
1164         }
1165         break;
1166     case QCOW2_CLUSTER_ZERO_PLAIN:
1167     case QCOW2_CLUSTER_UNALLOCATED:
1168         break;
1169     default:
1170         abort();
1171     }
1172 }
1173 
1174 int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
1175 {
1176     BDRVQcow2State *s = bs->opaque;
1177     int ret;
1178 
1179     ret = qcow2_cache_write(bs, s->l2_table_cache);
1180     if (ret < 0) {
1181         return ret;
1182     }
1183 
1184     if (qcow2_need_accurate_refcounts(s)) {
1185         ret = qcow2_cache_write(bs, s->refcount_block_cache);
1186         if (ret < 0) {
1187             return ret;
1188         }
1189     }
1190 
1191     return 0;
1192 }
1193 
1194 int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
1195 {
1196     int ret = qcow2_write_caches(bs);
1197     if (ret < 0) {
1198         return ret;
1199     }
1200 
1201     return bdrv_flush(bs->file->bs);
1202 }
1203 
1204 /*********************************************************/
1205 /* snapshots and image creation */
1206 
1207 
1208 
/* update the refcounts of snapshots and the copied flag */
/*
 * Walks the L1 table at @l1_table_offset (which may be the active one) and,
 * for @addend != 0, adjusts the refcount of every referenced L2 table and
 * data/compressed cluster by abs(addend). In all cases QCOW_OFLAG_COPIED is
 * recomputed from the resulting refcounts in the L2 entries and the L1
 * entries, and the L1 table is written back if it changed (unless it is
 * being deleted, addend == -1).
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
    n_slices = s->cluster_size / slice_size2;

    /* Batch all discards until the end of this function */
    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        /* Inactive L1 table: read it from disk and byteswap it */
        l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        /* Active L1 table: operate on the in-memory copy directly */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            /* Process the L2 table one cached slice at a time */
            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = be64_to_cpu(l2_slice[j]);
                    old_entry = entry;
                    /* COPIED is recomputed below from the new refcount */
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((entry >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(
                                bs, (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        /* Re-read the refcount to decide on COPIED below */
                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    /* A refcount of exactly 1 means the cluster is only used
                     * here, so it may be written to in place */
                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            /* The new reference must be on disk before the
                             * L2 entry that creates it */
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        l2_slice[j] = cpu_to_be64(entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            /* Update the refcount of the L2 table itself and recompute the
             * COPIED flag of its L1 entry */
            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    /* Submit all discards batched during the walk */
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        /* Byteswap to big endian for the on-disk write, then back */
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}
1411 
1412 
1413 
1414 
1415 /*********************************************************/
1416 /* refcount checking functions */
1417 
1418 
1419 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1420 {
1421     /* This assertion holds because there is no way we can address more than
1422      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1423      * offsets have to be representable in bytes); due to every cluster
1424      * corresponding to one refcount entry, we are well below that limit */
1425     assert(entries < (UINT64_C(1) << (64 - 9)));
1426 
1427     /* Thanks to the assertion this will not overflow, because
1428      * s->refcount_order < 7.
1429      * (note: x << s->refcount_order == x * s->refcount_bits) */
1430     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1431 }
1432 
1433 /**
1434  * Reallocates *array so that it can hold new_size entries. *size must contain
1435  * the current number of entries in *array. If the reallocation fails, *array
1436  * and *size will not be modified and -errno will be returned. If the
1437  * reallocation is successful, *array will be set to the new buffer, *size
1438  * will be set to new_size and 0 will be returned. The size of the reallocated
1439  * refcount array buffer will be aligned to a cluster boundary, and the newly
1440  * allocated area will be zeroed.
1441  */
1442 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1443                                   int64_t *size, int64_t new_size)
1444 {
1445     int64_t old_byte_size, new_byte_size;
1446     void *new_ptr;
1447 
1448     /* Round to clusters so the array can be directly written to disk */
1449     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1450                     * s->cluster_size;
1451     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1452                     * s->cluster_size;
1453 
1454     if (new_byte_size == old_byte_size) {
1455         *size = new_size;
1456         return 0;
1457     }
1458 
1459     assert(new_byte_size > 0);
1460 
1461     if (new_byte_size > SIZE_MAX) {
1462         return -ENOMEM;
1463     }
1464 
1465     new_ptr = g_try_realloc(*array, new_byte_size);
1466     if (!new_ptr) {
1467         return -ENOMEM;
1468     }
1469 
1470     if (new_byte_size > old_byte_size) {
1471         memset((char *)new_ptr + old_byte_size, 0,
1472                new_byte_size - old_byte_size);
1473     }
1474 
1475     *array = new_ptr;
1476     *size  = new_size;
1477 
1478     return 0;
1479 }
1480 
1481 /*
1482  * Increases the refcount for a range of clusters in a given refcount table.
1483  * This is used to construct a temporary refcount table out of L1 and L2 tables
1484  * which can be compared to the refcount table saved in the image.
1485  *
1486  * Modifies the number of errors in res.
1487  */
1488 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1489                              void **refcount_table,
1490                              int64_t *refcount_table_size,
1491                              int64_t offset, int64_t size)
1492 {
1493     BDRVQcow2State *s = bs->opaque;
1494     uint64_t start, last, cluster_offset, k, refcount;
1495     int ret;
1496 
1497     if (size <= 0) {
1498         return 0;
1499     }
1500 
1501     start = start_of_cluster(s, offset);
1502     last = start_of_cluster(s, offset + size - 1);
1503     for(cluster_offset = start; cluster_offset <= last;
1504         cluster_offset += s->cluster_size) {
1505         k = cluster_offset >> s->cluster_bits;
1506         if (k >= *refcount_table_size) {
1507             ret = realloc_refcount_array(s, refcount_table,
1508                                          refcount_table_size, k + 1);
1509             if (ret < 0) {
1510                 res->check_errors++;
1511                 return ret;
1512             }
1513         }
1514 
1515         refcount = s->get_refcount(*refcount_table, k);
1516         if (refcount == s->refcount_max) {
1517             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1518                     "\n", cluster_offset);
1519             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1520                     "width or qemu-img convert to create a clean copy if the "
1521                     "image cannot be opened for writing\n");
1522             res->corruptions++;
1523             continue;
1524         }
1525         s->set_refcount(*refcount_table, k, refcount + 1);
1526     }
1527 
1528     return 0;
1529 }
1530 
/* Flags for check_refcounts_l1() and check_refcounts_l2(); combined as a
 * bitmask in their 'flags' parameter */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};
1535 
1536 /*
1537  * Increases the refcount in the given refcount table for the all clusters
1538  * referenced in the L2 table. While doing so, performs some checks on L2
1539  * entries.
1540  *
1541  * Returns the number of errors found by the checks or -errno if an internal
1542  * error occurred.
1543  */
1544 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1545                               void **refcount_table,
1546                               int64_t *refcount_table_size, int64_t l2_offset,
1547                               int flags, BdrvCheckMode fix)
1548 {
1549     BDRVQcow2State *s = bs->opaque;
1550     uint64_t *l2_table, l2_entry;
1551     uint64_t next_contiguous_offset = 0;
1552     int i, l2_size, nb_csectors, ret;
1553 
1554     /* Read L2 table from disk */
1555     l2_size = s->l2_size * sizeof(uint64_t);
1556     l2_table = g_malloc(l2_size);
1557 
1558     ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1559     if (ret < 0) {
1560         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1561         res->check_errors++;
1562         goto fail;
1563     }
1564 
1565     /* Do the actual checks */
1566     for(i = 0; i < s->l2_size; i++) {
1567         l2_entry = be64_to_cpu(l2_table[i]);
1568 
1569         switch (qcow2_get_cluster_type(l2_entry)) {
1570         case QCOW2_CLUSTER_COMPRESSED:
1571             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1572             if (l2_entry & QCOW_OFLAG_COPIED) {
1573                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1574                     "copied flag must never be set for compressed "
1575                     "clusters\n", l2_entry >> s->cluster_bits);
1576                 l2_entry &= ~QCOW_OFLAG_COPIED;
1577                 res->corruptions++;
1578             }
1579 
1580             /* Mark cluster as used */
1581             nb_csectors = ((l2_entry >> s->csize_shift) &
1582                            s->csize_mask) + 1;
1583             l2_entry &= s->cluster_offset_mask;
1584             ret = qcow2_inc_refcounts_imrt(bs, res,
1585                                            refcount_table, refcount_table_size,
1586                                            l2_entry & ~511, nb_csectors * 512);
1587             if (ret < 0) {
1588                 goto fail;
1589             }
1590 
1591             if (flags & CHECK_FRAG_INFO) {
1592                 res->bfi.allocated_clusters++;
1593                 res->bfi.compressed_clusters++;
1594 
1595                 /* Compressed clusters are fragmented by nature.  Since they
1596                  * take up sub-sector space but we only have sector granularity
1597                  * I/O we need to re-read the same sectors even for adjacent
1598                  * compressed clusters.
1599                  */
1600                 res->bfi.fragmented_clusters++;
1601             }
1602             break;
1603 
1604         case QCOW2_CLUSTER_ZERO_ALLOC:
1605         case QCOW2_CLUSTER_NORMAL:
1606         {
1607             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1608 
1609             if (flags & CHECK_FRAG_INFO) {
1610                 res->bfi.allocated_clusters++;
1611                 if (next_contiguous_offset &&
1612                     offset != next_contiguous_offset) {
1613                     res->bfi.fragmented_clusters++;
1614                 }
1615                 next_contiguous_offset = offset + s->cluster_size;
1616             }
1617 
1618             /* Correct offsets are cluster aligned */
1619             if (offset_into_cluster(s, offset)) {
1620                 if (qcow2_get_cluster_type(l2_entry) ==
1621                     QCOW2_CLUSTER_ZERO_ALLOC)
1622                 {
1623                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
1624                             "cluster is not properly aligned; L2 entry "
1625                             "corrupted.\n",
1626                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1627                             offset);
1628                     if (fix & BDRV_FIX_ERRORS) {
1629                         uint64_t l2e_offset =
1630                             l2_offset + (uint64_t)i * sizeof(uint64_t);
1631 
1632                         l2_entry = QCOW_OFLAG_ZERO;
1633                         l2_table[i] = cpu_to_be64(l2_entry);
1634                         ret = qcow2_pre_write_overlap_check(bs,
1635                                 QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
1636                                 l2e_offset, sizeof(uint64_t));
1637                         if (ret < 0) {
1638                             fprintf(stderr, "ERROR: Overlap check failed\n");
1639                             res->check_errors++;
1640                             /* Something is seriously wrong, so abort checking
1641                              * this L2 table */
1642                             goto fail;
1643                         }
1644 
1645                         ret = bdrv_pwrite_sync(bs->file, l2e_offset,
1646                                                &l2_table[i], sizeof(uint64_t));
1647                         if (ret < 0) {
1648                             fprintf(stderr, "ERROR: Failed to overwrite L2 "
1649                                     "table entry: %s\n", strerror(-ret));
1650                             res->check_errors++;
1651                             /* Do not abort, continue checking the rest of this
1652                              * L2 table's entries */
1653                         } else {
1654                             res->corruptions_fixed++;
1655                             /* Skip marking the cluster as used
1656                              * (it is unused now) */
1657                             continue;
1658                         }
1659                     } else {
1660                         res->corruptions++;
1661                     }
1662                 } else {
1663                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1664                         "not properly aligned; L2 entry corrupted.\n", offset);
1665                     res->corruptions++;
1666                 }
1667             }
1668 
1669             /* Mark cluster as used */
1670             ret = qcow2_inc_refcounts_imrt(bs, res,
1671                                            refcount_table, refcount_table_size,
1672                                            offset, s->cluster_size);
1673             if (ret < 0) {
1674                 goto fail;
1675             }
1676             break;
1677         }
1678 
1679         case QCOW2_CLUSTER_ZERO_PLAIN:
1680         case QCOW2_CLUSTER_UNALLOCATED:
1681             break;
1682 
1683         default:
1684             abort();
1685         }
1686     }
1687 
1688     g_free(l2_table);
1689     return 0;
1690 
1691 fail:
1692     g_free(l2_table);
1693     return ret;
1694 }
1695 
1696 /*
1697  * Increases the refcount for the L1 table, its L2 tables and all referenced
1698  * clusters in the given refcount table. While doing so, performs some checks
1699  * on L1 and L2 entries.
1700  *
 * Returns 0 if no internal error occurred (any corruptions or leaks found by
 * the checks are accounted for in *res) or -errno on internal error.
1703  */
1704 static int check_refcounts_l1(BlockDriverState *bs,
1705                               BdrvCheckResult *res,
1706                               void **refcount_table,
1707                               int64_t *refcount_table_size,
1708                               int64_t l1_table_offset, int l1_size,
1709                               int flags, BdrvCheckMode fix)
1710 {
1711     BDRVQcow2State *s = bs->opaque;
1712     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1713     int i, ret;
1714 
1715     l1_size2 = l1_size * sizeof(uint64_t);
1716 
1717     /* Mark L1 table as used */
1718     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1719                                    l1_table_offset, l1_size2);
1720     if (ret < 0) {
1721         goto fail;
1722     }
1723 
1724     /* Read L1 table entries from disk */
1725     if (l1_size2 > 0) {
1726         l1_table = g_try_malloc(l1_size2);
1727         if (l1_table == NULL) {
1728             ret = -ENOMEM;
1729             res->check_errors++;
1730             goto fail;
1731         }
1732         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1733         if (ret < 0) {
1734             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1735             res->check_errors++;
1736             goto fail;
1737         }
1738         for(i = 0;i < l1_size; i++)
1739             be64_to_cpus(&l1_table[i]);
1740     }
1741 
1742     /* Do the actual checks */
1743     for(i = 0; i < l1_size; i++) {
1744         l2_offset = l1_table[i];
1745         if (l2_offset) {
1746             /* Mark L2 table as used */
1747             l2_offset &= L1E_OFFSET_MASK;
1748             ret = qcow2_inc_refcounts_imrt(bs, res,
1749                                            refcount_table, refcount_table_size,
1750                                            l2_offset, s->cluster_size);
1751             if (ret < 0) {
1752                 goto fail;
1753             }
1754 
1755             /* L2 tables are cluster aligned */
1756             if (offset_into_cluster(s, l2_offset)) {
1757                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1758                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1759                 res->corruptions++;
1760             }
1761 
1762             /* Process and check L2 entries */
1763             ret = check_refcounts_l2(bs, res, refcount_table,
1764                                      refcount_table_size, l2_offset, flags,
1765                                      fix);
1766             if (ret < 0) {
1767                 goto fail;
1768             }
1769         }
1770     }
1771     g_free(l1_table);
1772     return 0;
1773 
1774 fail:
1775     g_free(l1_table);
1776     return ret;
1777 }
1778 
1779 /*
1780  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1781  *
1782  * This function does not print an error message nor does it increment
1783  * check_errors if qcow2_get_refcount fails (this is because such an error will
1784  * have been already detected and sufficiently signaled by the calling function
1785  * (qcow2_check_refcounts) by the time this function is called).
1786  */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    /* Scratch buffer big enough for one L2 table (one cluster) */
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;

    /* Invariant checked below: an L1/L2 entry must have QCOW_OFLAG_COPIED set
     * if and only if the refcount of the cluster it points to is exactly 1. */
    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        bool l2_dirty = false; /* set when l2_table needs writing back */

        if (!l2_offset) {
            /* Unallocated L1 entry, nothing to check */
            continue;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        /* Check OFLAG_COPIED on the L1 entry (flag refers to the L2 table) */
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" :
                                            "ERROR",
                    i, l1_entry, refcount);
            if (fix & BDRV_FIX_ERRORS) {
                /* Make the flag match the actual refcount and persist the
                 * corrected L1 entry immediately */
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        /* Load the L2 table to check OFLAG_COPIED on each data cluster */
        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            /* Only entries that actually reference a cluster carry a
             * meaningful OFLAG_COPIED */
            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                ret = qcow2_get_refcount(bs,
                                         data_offset >> s->cluster_bits,
                                         &refcount);
                if (ret < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" :
                                                    "ERROR",
                            l2_entry, refcount);
                    if (fix & BDRV_FIX_ERRORS) {
                        /* Fix the in-memory copy; the whole table is written
                         * back once after the loop (see l2_dirty) */
                        l2_table[j] = cpu_to_be64(refcount == 1
                                    ? l2_entry |  QCOW_OFLAG_COPIED
                                    : l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty = true;
                        res->corruptions_fixed++;
                    } else {
                        res->corruptions++;
                    }
                }
            }
        }

        /* Write the repaired L2 table back in one go, guarding against
         * clobbering other metadata structures first */
        if (l2_dirty) {
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}
1901 
1902 /*
1903  * Checks consistency of refblocks and accounts for each refblock in
1904  * *refcount_table.
1905  */
static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix, bool *rebuild,
                           void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
    int ret;

    for(i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i];
        cluster = offset >> s->cluster_bits;

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            /* A misaligned entry cannot be repaired in place; the refcount
             * structures have to be rebuilt from scratch */
            *rebuild = true;
            continue;
        }

        if (cluster >= *nb_clusters) {
            /* The refblock lies beyond the current end of the image file */
            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);

            if (fix & BDRV_FIX_ERRORS) {
                int64_t new_nb_clusters;
                Error *local_err = NULL;

                /* Guard the truncate target against signed overflow */
                if (offset > INT64_MAX - s->cluster_size) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                /* Grow the file so the refblock falls inside it again */
                ret = bdrv_truncate(bs->file, offset + s->cluster_size,
                                    PREALLOC_MODE_OFF, &local_err);
                if (ret < 0) {
                    error_report_err(local_err);
                    goto resize_fail;
                }
                size = bdrv_getlength(bs->file->bs);
                if (size < 0) {
                    ret = size;
                    goto resize_fail;
                }

                /* Grow the in-memory refcount table to match the new size */
                new_nb_clusters = size_to_clusters(s, size);
                assert(new_nb_clusters >= *nb_clusters);

                ret = realloc_refcount_array(s, refcount_table,
                                             nb_clusters, new_nb_clusters);
                if (ret < 0) {
                    res->check_errors++;
                    return ret;
                }

                /* If the refblock is still out of range, the resize was
                 * insufficient (should not happen) -- treat as failure */
                if (cluster >= *nb_clusters) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                res->corruptions_fixed++;
                ret = qcow2_inc_refcounts_imrt(bs, res,
                                               refcount_table, nb_clusters,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
                /* No need to check whether the refcount is now greater than 1:
                 * This area was just allocated and zeroed, so it can only be
                 * exactly 1 after qcow2_inc_refcounts_imrt() */
                continue;

resize_fail:
                res->corruptions++;
                *rebuild = true;
                fprintf(stderr, "ERROR could not resize image: %s\n",
                        strerror(-ret));
            } else {
                res->corruptions++;
            }
            continue;
        }

        if (offset != 0) {
            /* Account for the refblock itself; it must be referenced exactly
             * once, namely by this very refcount table entry */
            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                           offset, s->cluster_size);
            if (ret < 0) {
                return ret;
            }
            if (s->get_refcount(*refcount_table, cluster) != 1) {
                fprintf(stderr, "ERROR refcount block %" PRId64
                        " refcount=%" PRIu64 "\n", i,
                        s->get_refcount(*refcount_table, cluster));
                res->corruptions++;
                *rebuild = true;
            }
        }
    }

    return 0;
}
2009 
2010 /*
2011  * Calculates an in-memory refcount table.
2012  */
2013 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2014                                BdrvCheckMode fix, bool *rebuild,
2015                                void **refcount_table, int64_t *nb_clusters)
2016 {
2017     BDRVQcow2State *s = bs->opaque;
2018     int64_t i;
2019     QCowSnapshot *sn;
2020     int ret;
2021 
2022     if (!*refcount_table) {
2023         int64_t old_size = 0;
2024         ret = realloc_refcount_array(s, refcount_table,
2025                                      &old_size, *nb_clusters);
2026         if (ret < 0) {
2027             res->check_errors++;
2028             return ret;
2029         }
2030     }
2031 
2032     /* header */
2033     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2034                                    0, s->cluster_size);
2035     if (ret < 0) {
2036         return ret;
2037     }
2038 
2039     /* current L1 table */
2040     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2041                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2042                              fix);
2043     if (ret < 0) {
2044         return ret;
2045     }
2046 
2047     /* snapshots */
2048     for (i = 0; i < s->nb_snapshots; i++) {
2049         sn = s->snapshots + i;
2050         if (offset_into_cluster(s, sn->l1_table_offset)) {
2051             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2052                     "L1 table is not cluster aligned; snapshot table entry "
2053                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2054             res->corruptions++;
2055             continue;
2056         }
2057         if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
2058             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2059                     "L1 table is too large; snapshot table entry corrupted\n",
2060                     sn->id_str, sn->name, sn->l1_size);
2061             res->corruptions++;
2062             continue;
2063         }
2064         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2065                                  sn->l1_table_offset, sn->l1_size, 0, fix);
2066         if (ret < 0) {
2067             return ret;
2068         }
2069     }
2070     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2071                                    s->snapshots_offset, s->snapshots_size);
2072     if (ret < 0) {
2073         return ret;
2074     }
2075 
2076     /* refcount data */
2077     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2078                                    s->refcount_table_offset,
2079                                    s->refcount_table_size * sizeof(uint64_t));
2080     if (ret < 0) {
2081         return ret;
2082     }
2083 
2084     /* encryption */
2085     if (s->crypto_header.length) {
2086         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2087                                        s->crypto_header.offset,
2088                                        s->crypto_header.length);
2089         if (ret < 0) {
2090             return ret;
2091         }
2092     }
2093 
2094     /* bitmaps */
2095     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2096     if (ret < 0) {
2097         return ret;
2098     }
2099 
2100     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2101 }
2102 
2103 /*
 * Compares the refcounts recorded in the image's on-disk refcount structures
 * against the expected refcounts computed in refcount_table.
2106  */
2107 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2108                               BdrvCheckMode fix, bool *rebuild,
2109                               int64_t *highest_cluster,
2110                               void *refcount_table, int64_t nb_clusters)
2111 {
2112     BDRVQcow2State *s = bs->opaque;
2113     int64_t i;
2114     uint64_t refcount1, refcount2;
2115     int ret;
2116 
2117     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2118         ret = qcow2_get_refcount(bs, i, &refcount1);
2119         if (ret < 0) {
2120             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2121                     i, strerror(-ret));
2122             res->check_errors++;
2123             continue;
2124         }
2125 
2126         refcount2 = s->get_refcount(refcount_table, i);
2127 
2128         if (refcount1 > 0 || refcount2 > 0) {
2129             *highest_cluster = i;
2130         }
2131 
2132         if (refcount1 != refcount2) {
2133             /* Check if we're allowed to fix the mismatch */
2134             int *num_fixed = NULL;
2135             if (refcount1 == 0) {
2136                 *rebuild = true;
2137             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2138                 num_fixed = &res->leaks_fixed;
2139             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2140                 num_fixed = &res->corruptions_fixed;
2141             }
2142 
2143             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2144                     " reference=%" PRIu64 "\n",
2145                    num_fixed != NULL     ? "Repairing" :
2146                    refcount1 < refcount2 ? "ERROR" :
2147                                            "Leaked",
2148                    i, refcount1, refcount2);
2149 
2150             if (num_fixed) {
2151                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2152                                       refcount_diff(refcount1, refcount2),
2153                                       refcount1 > refcount2,
2154                                       QCOW2_DISCARD_ALWAYS);
2155                 if (ret >= 0) {
2156                     (*num_fixed)++;
2157                     continue;
2158                 }
2159             }
2160 
2161             /* And if we couldn't, print an error */
2162             if (refcount1 < refcount2) {
2163                 res->corruptions++;
2164             } else {
2165                 res->leaks++;
2166             }
2167         }
2168     }
2169 }
2170 
2171 /*
2172  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2173  * the on-disk refcount structures.
2174  *
2175  * On input, *first_free_cluster tells where to start looking, and need not
2176  * actually be a free cluster; the returned offset will not be before that
2177  * cluster.  On output, *first_free_cluster points to the first gap found, even
2178  * if that gap was too small to be used as the returned offset.
2179  *
2180  * Note that *first_free_cluster is a cluster index whereas the return value is
2181  * an offset.
2182  */
static int64_t alloc_clusters_imrt(BlockDriverState *bs,
                                   int cluster_count,
                                   void **refcount_table,
                                   int64_t *imrt_nb_clusters,
                                   int64_t *first_free_cluster)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t cluster = *first_free_cluster, i;
    bool first_gap = true; /* have we seen any free cluster yet? */
    int contiguous_free_clusters;
    int ret;

    /* Starting at *first_free_cluster, find a range of at least cluster_count
     * continuously free clusters */
    for (contiguous_free_clusters = 0;
         cluster < *imrt_nb_clusters &&
         contiguous_free_clusters < cluster_count;
         cluster++)
    {
        if (!s->get_refcount(*refcount_table, cluster)) {
            contiguous_free_clusters++;
            if (first_gap) {
                /* If this is the first free cluster found, update
                 * *first_free_cluster accordingly */
                *first_free_cluster = cluster;
                first_gap = false;
            }
        } else if (contiguous_free_clusters) {
            /* Used cluster ends the current run of free clusters */
            contiguous_free_clusters = 0;
        }
    }

    /* If contiguous_free_clusters is greater than zero, it contains the number
     * of continuously free clusters until the current cluster; the first free
     * cluster in the current "gap" is therefore
     * cluster - contiguous_free_clusters */

    /* If no such range could be found, grow the in-memory refcount table
     * accordingly to append free clusters at the end of the image */
    if (contiguous_free_clusters < cluster_count) {
        /* contiguous_free_clusters clusters are already empty at the image end;
         * we need cluster_count clusters; therefore, we have to allocate
         * cluster_count - contiguous_free_clusters new clusters at the end of
         * the image (which is the current value of cluster; note that cluster
         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
         * the image end) */
        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
                                     cluster + cluster_count
                                     - contiguous_free_clusters);
        if (ret < 0) {
            return ret;
        }
    }

    /* Go back to the first free cluster */
    cluster -= contiguous_free_clusters;
    /* Mark the allocated range as used in the in-memory refcount table */
    for (i = 0; i < cluster_count; i++) {
        s->set_refcount(*refcount_table, cluster + i, 1);
    }

    /* Convert the cluster index into a byte offset */
    return cluster << s->cluster_bits;
}
2245 
2246 /*
2247  * Creates a new refcount structure based solely on the in-memory information
2248  * given through *refcount_table. All necessary allocations will be reflected
2249  * in that array.
2250  *
2251  * On success, the old refcount structure is leaked (it will be covered by the
2252  * new refcount structure).
2253  */
static int rebuild_refcount_structure(BlockDriverState *bs,
                                      BdrvCheckResult *res,
                                      void **refcount_table,
                                      int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
    int64_t refblock_offset, refblock_start, refblock_index;
    uint32_t reftable_size = 0;
    uint64_t *on_disk_reftable = NULL;
    void *on_disk_refblock;
    int ret = 0;
    /* Image header fields (refcount_table_offset/_clusters) written in one
     * atomic update at the end */
    struct {
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED reftable_offset_and_clusters;

    /* Drop cached refblocks; they describe the old, broken structures */
    qcow2_cache_empty(bs, s->refcount_block_cache);

    /* Walk all clusters with a nonzero refcount; for each refblock's worth of
     * them, allocate (in the IMRT) and write out an on-disk refblock.  If the
     * reftable has to grow or be (re)placed, allocation happens here too, and
     * the loop may be restarted (see the goto at the bottom) because any new
     * allocation invalidates refblocks already derived from the IMRT. */
write_refblocks:
    for (; cluster < *nb_clusters; cluster++) {
        if (!s->get_refcount(*refcount_table, cluster)) {
            continue;
        }

        refblock_index = cluster >> s->refcount_block_bits;
        refblock_start = refblock_index << s->refcount_block_bits;

        /* Don't allocate a cluster in a refblock already written to disk */
        if (first_free_cluster < refblock_start) {
            first_free_cluster = refblock_start;
        }
        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
                                              nb_clusters, &first_free_cluster);
        if (refblock_offset < 0) {
            fprintf(stderr, "ERROR allocating refblock: %s\n",
                    strerror(-refblock_offset));
            res->check_errors++;
            ret = refblock_offset;
            goto fail;
        }

        /* Grow the in-memory reftable if this refblock does not fit yet */
        if (reftable_size <= refblock_index) {
            uint32_t old_reftable_size = reftable_size;
            uint64_t *new_on_disk_reftable;

            /* Round the new size up to a whole number of clusters */
            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
                                     s->cluster_size) / sizeof(uint64_t);
            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
                                                 reftable_size *
                                                 sizeof(uint64_t));
            if (!new_on_disk_reftable) {
                res->check_errors++;
                ret = -ENOMEM;
                goto fail;
            }
            on_disk_reftable = new_on_disk_reftable;

            /* Zero-fill the newly appended entries */
            memset(on_disk_reftable + old_reftable_size, 0,
                   (reftable_size - old_reftable_size) * sizeof(uint64_t));

            /* The offset we have for the reftable is now no longer valid;
             * this will leak that range, but we can easily fix that by running
             * a leak-fixing check after this rebuild operation */
            reftable_offset = -1;
        } else {
            assert(on_disk_reftable);
        }
        on_disk_reftable[refblock_index] = refblock_offset;

        /* If this is apparently the last refblock (for now), try to squeeze the
         * reftable in */
        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
            reftable_offset < 0)
        {
            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
                                                          sizeof(uint64_t));
            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                                  refcount_table, nb_clusters,
                                                  &first_free_cluster);
            if (reftable_offset < 0) {
                fprintf(stderr, "ERROR allocating reftable: %s\n",
                        strerror(-reftable_offset));
                res->check_errors++;
                ret = reftable_offset;
                goto fail;
            }
        }

        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                            s->cluster_size);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* The size of *refcount_table is always cluster-aligned, therefore the
         * write operation will not overflow */
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
                         on_disk_refblock, s->cluster_sectors);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* Go to the end of this refblock */
        cluster = refblock_start + s->refcount_block_size - 1;
    }

    /* If the reftable could not be squeezed in above, place it after all
     * refblocks and restart so the new allocation is covered too */
    if (reftable_offset < 0) {
        uint64_t post_refblock_start, reftable_clusters;

        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
        reftable_clusters = size_to_clusters(s,
                                             reftable_size * sizeof(uint64_t));
        /* Not pretty but simple */
        if (first_free_cluster < post_refblock_start) {
            first_free_cluster = post_refblock_start;
        }
        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                              refcount_table, nb_clusters,
                                              &first_free_cluster);
        if (reftable_offset < 0) {
            fprintf(stderr, "ERROR allocating reftable: %s\n",
                    strerror(-reftable_offset));
            res->check_errors++;
            ret = reftable_offset;
            goto fail;
        }

        goto write_refblocks;
    }

    /* Convert the reftable to big-endian for the on-disk write */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        cpu_to_be64s(&on_disk_reftable[refblock_index]);
    }

    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
                                        reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    assert(reftable_size < INT_MAX / sizeof(uint64_t));
    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
                      reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Enter new reftable into the image header */
    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
    reftable_offset_and_clusters.reftable_clusters =
        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &reftable_offset_and_clusters,
                           sizeof(reftable_offset_and_clusters));
    if (ret < 0) {
        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Convert back to host byte order so the array can serve as the new
     * in-memory reftable */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        be64_to_cpus(&on_disk_reftable[refblock_index]);
    }
    s->refcount_table = on_disk_reftable;
    s->refcount_table_offset = reftable_offset;
    s->refcount_table_size = reftable_size;
    update_max_refcount_table_index(s);

    return 0;

fail:
    g_free(on_disk_reftable);
    return ret;
}
2436 
2437 /*
2438  * Checks an image for refcount consistency.
2439  *
2440  * Returns 0 if no errors are found, the number of errors in case the image is
2441  * detected as corrupted, and -errno when an internal error occurred.
2442  */
2443 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2444                           BdrvCheckMode fix)
2445 {
2446     BDRVQcow2State *s = bs->opaque;
2447     BdrvCheckResult pre_compare_res;
2448     int64_t size, highest_cluster, nb_clusters;
2449     void *refcount_table = NULL;
2450     bool rebuild = false;
2451     int ret;
2452 
2453     size = bdrv_getlength(bs->file->bs);
2454     if (size < 0) {
2455         res->check_errors++;
2456         return size;
2457     }
2458 
2459     nb_clusters = size_to_clusters(s, size);
2460     if (nb_clusters > INT_MAX) {
2461         res->check_errors++;
2462         return -EFBIG;
2463     }
2464 
2465     res->bfi.total_clusters =
2466         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2467 
2468     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2469                               &nb_clusters);
2470     if (ret < 0) {
2471         goto fail;
2472     }
2473 
2474     /* In case we don't need to rebuild the refcount structure (but want to fix
2475      * something), this function is immediately called again, in which case the
2476      * result should be ignored */
2477     pre_compare_res = *res;
2478     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2479                       nb_clusters);
2480 
2481     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2482         BdrvCheckResult old_res = *res;
2483         int fresh_leaks = 0;
2484 
2485         fprintf(stderr, "Rebuilding refcount structure\n");
2486         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2487                                          &nb_clusters);
2488         if (ret < 0) {
2489             goto fail;
2490         }
2491 
2492         res->corruptions = 0;
2493         res->leaks = 0;
2494 
2495         /* Because the old reftable has been exchanged for a new one the
2496          * references have to be recalculated */
2497         rebuild = false;
2498         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2499         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2500                                   &nb_clusters);
2501         if (ret < 0) {
2502             goto fail;
2503         }
2504 
2505         if (fix & BDRV_FIX_LEAKS) {
2506             /* The old refcount structures are now leaked, fix it; the result
2507              * can be ignored, aside from leaks which were introduced by
2508              * rebuild_refcount_structure() that could not be fixed */
2509             BdrvCheckResult saved_res = *res;
2510             *res = (BdrvCheckResult){ 0 };
2511 
2512             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2513                               &highest_cluster, refcount_table, nb_clusters);
2514             if (rebuild) {
2515                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2516                         "broken\n");
2517             }
2518 
2519             /* Any leaks accounted for here were introduced by
2520              * rebuild_refcount_structure() because that function has created a
2521              * new refcount structure from scratch */
2522             fresh_leaks = res->leaks;
2523             *res = saved_res;
2524         }
2525 
2526         if (res->corruptions < old_res.corruptions) {
2527             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2528         }
2529         if (res->leaks < old_res.leaks) {
2530             res->leaks_fixed += old_res.leaks - res->leaks;
2531         }
2532         res->leaks += fresh_leaks;
2533     } else if (fix) {
2534         if (rebuild) {
2535             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2536             res->check_errors++;
2537             ret = -EIO;
2538             goto fail;
2539         }
2540 
2541         if (res->leaks || res->corruptions) {
2542             *res = pre_compare_res;
2543             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2544                               refcount_table, nb_clusters);
2545         }
2546     }
2547 
2548     /* check OFLAG_COPIED */
2549     ret = check_oflag_copied(bs, res, fix);
2550     if (ret < 0) {
2551         goto fail;
2552     }
2553 
2554     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2555     ret = 0;
2556 
2557 fail:
2558     g_free(refcount_table);
2559 
2560     return ret;
2561 }
2562 
/* Helper for qcow2_check_metadata_overlap(); relies on the local variables
 * "offset" and "size" of the enclosing function */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)

/*
 * Checks if the given offset into the image file is actually free to use by
 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
 * i.e. a sanity check without relying on the refcount tables.
 *
 * The ign parameter specifies what checks not to perform (being a bitmask of
 * QCow2MetadataOverlap values), i.e., what sections to ignore.
 *
 * Returns:
 * - 0 if writing to this offset will not affect the mentioned metadata
 * - a positive QCow2MetadataOverlap value indicating one overlapping section
 * - a negative value (-errno) indicating an error while performing a check,
 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
 */
int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                                 int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    /* Only perform checks which are enabled and not explicitly ignored */
    int chk = s->overlap_check & ~ign;
    int i, j;

    if (!size) {
        return 0;
    }

    /* The header check uses the exact offset, before cluster alignment */
    if (chk & QCOW2_OL_MAIN_HEADER) {
        if (offset < s->cluster_size) {
            return QCOW2_OL_MAIN_HEADER;
        }
    }

    /* align range to test to cluster boundaries */
    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
    offset = start_of_cluster(s, offset);

    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
            return QCOW2_OL_ACTIVE_L1;
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
        if (overlaps_with(s->refcount_table_offset,
            s->refcount_table_size * sizeof(uint64_t))) {
            return QCOW2_OL_REFCOUNT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
            return QCOW2_OL_SNAPSHOT_TABLE;
        }
    }

    /* Each snapshot's L1 table, as recorded in the snapshot table */
    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            if (s->snapshots[i].l1_size &&
                overlaps_with(s->snapshots[i].l1_table_offset,
                s->snapshots[i].l1_size * sizeof(uint64_t))) {
                return QCOW2_OL_INACTIVE_L1;
            }
        }
    }

    /* Every cluster referenced by the active L1 table is an L2 table */
    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
        for (i = 0; i < s->l1_size; i++) {
            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_ACTIVE_L2;
            }
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
        unsigned last_entry = s->max_refcount_table_index;
        /* Entries past max_refcount_table_index must be unallocated, so they
         * need not be scanned */
        assert(last_entry < s->refcount_table_size);
        assert(last_entry + 1 == s->refcount_table_size ||
               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
        for (i = 0; i <= last_entry; i++) {
            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_REFCOUNT_BLOCK;
            }
        }
    }

    /* Inactive L2 tables are only reachable through each snapshot's L1 table,
     * which has to be read from disk here; this is the only check that can
     * fail with -errno */
    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
            uint32_t l1_sz  = s->snapshots[i].l1_size;
            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
            uint64_t *l1;
            int ret;

            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, sizeof(uint64_t),
                                       QCOW_MAX_L1_SIZE, "", NULL);
            if (ret < 0) {
                return ret;
            }

            l1 = g_try_malloc(l1_sz2);

            if (l1_sz2 && l1 == NULL) {
                return -ENOMEM;
            }

            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
            if (ret < 0) {
                g_free(l1);
                return ret;
            }

            /* On-disk L1 entries are big-endian */
            for (j = 0; j < l1_sz; j++) {
                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
                    g_free(l1);
                    return QCOW2_OL_INACTIVE_L2;
                }
            }

            g_free(l1);
        }
    }

    return 0;
}
2694 
/* Human-readable names for the metadata sections, indexed by the
 * QCOW2_OL_*_BITNR bit numbers; used in corruption messages */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};
2705 
2706 /*
2707  * First performs a check for metadata overlaps (through
2708  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2709  * while performing a check), that value is returned. If an impending overlap
2710  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2711  * and -EIO returned.
2712  *
2713  * Returns 0 if there were neither overlaps nor errors while checking for
2714  * overlaps; or a negative value (-errno) on error.
2715  */
2716 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2717                                   int64_t size)
2718 {
2719     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2720 
2721     if (ret < 0) {
2722         return ret;
2723     } else if (ret > 0) {
2724         int metadata_ol_bitnr = ctz32(ret);
2725         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2726 
2727         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2728                                 "write on metadata (overlaps with %s)",
2729                                 metadata_ol_names[metadata_ol_bitnr]);
2730         return -EIO;
2731     }
2732 
2733     return 0;
2734 }
2735 
/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty.
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated) at @reftable_index.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 * Returns 0 on success, or a negative errno (with @errp set) on failure.
 */
typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                               uint64_t reftable_index, uint64_t *reftable_size,
                               void *refblock, bool refblock_empty,
                               bool *allocated, Error **errp);
2750 
2751 /**
2752  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2753  * it is not empty) and inserts its offset into the new reftable. The size of
2754  * this new reftable is increased as required.
2755  */
2756 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2757                           uint64_t reftable_index, uint64_t *reftable_size,
2758                           void *refblock, bool refblock_empty, bool *allocated,
2759                           Error **errp)
2760 {
2761     BDRVQcow2State *s = bs->opaque;
2762     int64_t offset;
2763 
2764     if (!refblock_empty && reftable_index >= *reftable_size) {
2765         uint64_t *new_reftable;
2766         uint64_t new_reftable_size;
2767 
2768         new_reftable_size = ROUND_UP(reftable_index + 1,
2769                                      s->cluster_size / sizeof(uint64_t));
2770         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2771             error_setg(errp,
2772                        "This operation would make the refcount table grow "
2773                        "beyond the maximum size supported by QEMU, aborting");
2774             return -ENOTSUP;
2775         }
2776 
2777         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2778                                                 sizeof(uint64_t));
2779         if (!new_reftable) {
2780             error_setg(errp, "Failed to increase reftable buffer size");
2781             return -ENOMEM;
2782         }
2783 
2784         memset(new_reftable + *reftable_size, 0,
2785                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2786 
2787         *reftable      = new_reftable;
2788         *reftable_size = new_reftable_size;
2789     }
2790 
2791     if (!refblock_empty && !(*reftable)[reftable_index]) {
2792         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2793         if (offset < 0) {
2794             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2795             return offset;
2796         }
2797         (*reftable)[reftable_index] = offset;
2798         *allocated = true;
2799     }
2800 
2801     return 0;
2802 }
2803 
2804 /**
2805  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2806  * offset specified by the new reftable's entry. It does not modify the new
2807  * reftable or change any refcounts.
2808  */
2809 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2810                           uint64_t reftable_index, uint64_t *reftable_size,
2811                           void *refblock, bool refblock_empty, bool *allocated,
2812                           Error **errp)
2813 {
2814     BDRVQcow2State *s = bs->opaque;
2815     int64_t offset;
2816     int ret;
2817 
2818     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2819         offset = (*reftable)[reftable_index];
2820 
2821         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2822         if (ret < 0) {
2823             error_setg_errno(errp, -ret, "Overlap check failed");
2824             return ret;
2825         }
2826 
2827         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2828         if (ret < 0) {
2829             error_setg_errno(errp, -ret, "Failed to write refblock");
2830             return ret;
2831         }
2832     } else {
2833         assert(refblock_empty);
2834     }
2835 
2836     return 0;
2837 }
2838 
2839 /**
2840  * This function walks over the existing reftable and every referenced refblock;
2841  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2842  * create an equal new entry in the passed @new_refblock. Once that
2843  * @new_refblock is completely filled, @operation will be called.
2844  *
2845  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2846  * @index is the index of the walk_over_reftable() calls and @total is the total
2847  * number of walk_over_reftable() calls per amend operation. Both are used for
2848  * calculating the parameters for the status callback.
2849  *
2850  * @allocated is set to true if a new cluster has been allocated.
2851  */
2852 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2853                               uint64_t *new_reftable_index,
2854                               uint64_t *new_reftable_size,
2855                               void *new_refblock, int new_refblock_size,
2856                               int new_refcount_bits,
2857                               RefblockFinishOp *operation, bool *allocated,
2858                               Qcow2SetRefcountFunc *new_set_refcount,
2859                               BlockDriverAmendStatusCB *status_cb,
2860                               void *cb_opaque, int index, int total,
2861                               Error **errp)
2862 {
2863     BDRVQcow2State *s = bs->opaque;
2864     uint64_t reftable_index;
2865     bool new_refblock_empty = true;
2866     int refblock_index;
2867     int new_refblock_index = 0;
2868     int ret;
2869 
2870     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2871          reftable_index++)
2872     {
2873         uint64_t refblock_offset = s->refcount_table[reftable_index]
2874                                  & REFT_OFFSET_MASK;
2875 
2876         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2877                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2878 
2879         if (refblock_offset) {
2880             void *refblock;
2881 
2882             if (offset_into_cluster(s, refblock_offset)) {
2883                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2884                                         PRIx64 " unaligned (reftable index: %#"
2885                                         PRIx64 ")", refblock_offset,
2886                                         reftable_index);
2887                 error_setg(errp,
2888                            "Image is corrupt (unaligned refblock offset)");
2889                 return -EIO;
2890             }
2891 
2892             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2893                                   &refblock);
2894             if (ret < 0) {
2895                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2896                 return ret;
2897             }
2898 
2899             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2900                  refblock_index++)
2901             {
2902                 uint64_t refcount;
2903 
2904                 if (new_refblock_index >= new_refblock_size) {
2905                     /* new_refblock is now complete */
2906                     ret = operation(bs, new_reftable, *new_reftable_index,
2907                                     new_reftable_size, new_refblock,
2908                                     new_refblock_empty, allocated, errp);
2909                     if (ret < 0) {
2910                         qcow2_cache_put(s->refcount_block_cache, &refblock);
2911                         return ret;
2912                     }
2913 
2914                     (*new_reftable_index)++;
2915                     new_refblock_index = 0;
2916                     new_refblock_empty = true;
2917                 }
2918 
2919                 refcount = s->get_refcount(refblock, refblock_index);
2920                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2921                     uint64_t offset;
2922 
2923                     qcow2_cache_put(s->refcount_block_cache, &refblock);
2924 
2925                     offset = ((reftable_index << s->refcount_block_bits)
2926                               + refblock_index) << s->cluster_bits;
2927 
2928                     error_setg(errp, "Cannot decrease refcount entry width to "
2929                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2930                                "refcount of %" PRIu64, new_refcount_bits,
2931                                offset, refcount);
2932                     return -EINVAL;
2933                 }
2934 
2935                 if (new_set_refcount) {
2936                     new_set_refcount(new_refblock, new_refblock_index++,
2937                                      refcount);
2938                 } else {
2939                     new_refblock_index++;
2940                 }
2941                 new_refblock_empty = new_refblock_empty && refcount == 0;
2942             }
2943 
2944             qcow2_cache_put(s->refcount_block_cache, &refblock);
2945         } else {
2946             /* No refblock means every refcount is 0 */
2947             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2948                  refblock_index++)
2949             {
2950                 if (new_refblock_index >= new_refblock_size) {
2951                     /* new_refblock is now complete */
2952                     ret = operation(bs, new_reftable, *new_reftable_index,
2953                                     new_reftable_size, new_refblock,
2954                                     new_refblock_empty, allocated, errp);
2955                     if (ret < 0) {
2956                         return ret;
2957                     }
2958 
2959                     (*new_reftable_index)++;
2960                     new_refblock_index = 0;
2961                     new_refblock_empty = true;
2962                 }
2963 
2964                 if (new_set_refcount) {
2965                     new_set_refcount(new_refblock, new_refblock_index++, 0);
2966                 } else {
2967                     new_refblock_index++;
2968                 }
2969             }
2970         }
2971     }
2972 
2973     if (new_refblock_index > 0) {
2974         /* Complete the potentially existing partially filled final refblock */
2975         if (new_set_refcount) {
2976             for (; new_refblock_index < new_refblock_size;
2977                  new_refblock_index++)
2978             {
2979                 new_set_refcount(new_refblock, new_refblock_index, 0);
2980             }
2981         }
2982 
2983         ret = operation(bs, new_reftable, *new_reftable_index,
2984                         new_reftable_size, new_refblock, new_refblock_empty,
2985                         allocated, errp);
2986         if (ret < 0) {
2987             return ret;
2988         }
2989 
2990         (*new_reftable_index)++;
2991     }
2992 
2993     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2994               (uint64_t)total * s->refcount_table_size, cb_opaque);
2995 
2996     return 0;
2997 }
2998 
/*
 * Changes the refcount entry width of the image to 2^@refcount_order bits
 * (amend operation). Builds a complete new refcount structure with the new
 * entry width, writes it to disk, switches the image header and the in-memory
 * state over to it, and finally frees the old refcount structure.
 *
 * @status_cb/@cb_opaque report progress. Returns 0 on success, or a negative
 * errno (with @errp set) on failure; on failure the old refcount structure
 * remains in effect.
 */
int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    /* Sub-byte refcount widths require a version 3 image */
    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    /* Repeat the allocation walk until it no longer allocates anything:
     * allocations change the refcounts, which may in turn require more
     * refblocks in the new structure */
    do {
        int total_walks;

        new_allocation = false;

        /* At least we have to do this walk and the one which writes the
         * refblocks; also, at least we have to do this loop here at least
         * twice (normally), first to do the allocations, and second to
         * determine that everything is correctly allocated, this then makes
         * three walks in total */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            /* The reftable may have grown; discard a previously allocated
             * on-disk reftable and allocate one of the current size */
            if (new_reftable_offset) {
                qcow2_free_clusters(bs, new_reftable_offset,
                                    allocated_reftable_size * sizeof(uint64_t),
                                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           sizeof(uint64_t));
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    /* Convert to big-endian for the on-disk write, then back, since the
     * in-memory buffer is still needed afterwards */
    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
                      new_reftable_size * sizeof(uint64_t));

    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order  = s->refcount_order;
    old_reftable_size   = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order        = refcount_order;
    s->refcount_table_size   = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* Roll back the in-memory fields changed above */
        s->refcount_order        = old_refcount_order;
        s->refcount_table_size   = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable        = old_reftable;
    new_reftable_size   = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}
3183 
3184 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3185 {
3186     BDRVQcow2State *s = bs->opaque;
3187     uint32_t index = offset_to_reftable_index(s, offset);
3188     int64_t covering_refblock_offset = 0;
3189 
3190     if (index < s->refcount_table_size) {
3191         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3192     }
3193     if (!covering_refblock_offset) {
3194         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3195                                 "not covered by the refcount structures",
3196                                 offset);
3197         return -EIO;
3198     }
3199 
3200     return covering_refblock_offset;
3201 }
3202 
/* Free the cluster occupied by the refcount block at @discard_block_offs.
 *
 * The block must currently hold a refcount of exactly 1 for its own cluster
 * (i.e. the only reference to it is its self-reference); that entry is
 * cleared, the cluster becomes reusable, and any cached copy of the block
 * is dropped so stale data cannot be written back later.
 *
 * Returns 0 on success; -EIO if @discard_block_offs is not covered by the
 * refcount structures; -EINVAL (after marking the image corrupt) if the
 * block's refcount is not 1; or another negative errno from the cache.
 */
static int qcow2_discard_refcount_block(BlockDriverState *bs,
                                        uint64_t discard_block_offs)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t refblock_offs;
    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
    void *refblock;
    int ret;

    /* Find the refblock whose range covers @discard_block_offs itself
     * (it may or may not be the very block being discarded) */
    refblock_offs = get_refblock_offset(bs, discard_block_offs);
    if (refblock_offs < 0) {
        return refblock_offs;
    }

    assert(discard_block_offs != 0);

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                          &refblock);
    if (ret < 0) {
        return ret;
    }

    /* A discardable refcount block may only be referenced by itself;
     * any other value means the caller's emptiness check and the on-disk
     * state disagree */
    if (s->get_refcount(refblock, block_index) != 1) {
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
                                " refblock offset %#" PRIx64
                                ", reftable index %u"
                                ", block offset %#" PRIx64
                                ", refcount %#" PRIx64,
                                refblock_offs,
                                offset_to_reftable_index(s, discard_block_offs),
                                discard_block_offs,
                                s->get_refcount(refblock, block_index));
        qcow2_cache_put(s->refcount_block_cache, &refblock);
        return -EINVAL;
    }
    /* Drop the self-reference: the cluster is now free */
    s->set_refcount(refblock, block_index, 0);

    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);

    qcow2_cache_put(s->refcount_block_cache, &refblock);

    /* The freed cluster may now be the lowest free one */
    if (cluster_index < s->free_cluster_index) {
        s->free_cluster_index = cluster_index;
    }

    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                           discard_block_offs);
    if (refblock) {
        /* discard refblock from the cache if refblock is cached */
        qcow2_cache_discard(s->refcount_block_cache, refblock);
    }
    /* Queue (or immediately perform) a discard of the underlying storage */
    update_refcount_discard(bs, discard_block_offs, s->cluster_size);

    return 0;
}
3259 
/* Prune the refcount table by dropping refcount blocks that track no
 * allocated clusters.
 *
 * Each refblock is inspected; one that contains no references (ignoring
 * its own self-reference, when the block happens to cover itself) gets its
 * reftable entry cleared.  The pruned table is written to disk first; only
 * then are the now-unreferenced refblock clusters freed and the in-memory
 * table updated.
 *
 * Returns 0 on success, a negative errno on failure.  Note that even after
 * a failed reftable write the in-memory entries are still cleared, to stay
 * consistent with whatever partial state may have reached the disk (see
 * the comment below).
 */
int qcow2_shrink_reftable(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *reftable_tmp =
        g_malloc(s->refcount_table_size * sizeof(uint64_t));
    int i, ret;

    for (i = 0; i < s->refcount_table_size; i++) {
        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
        void *refblock;
        bool unused_block;

        if (refblock_offs == 0) {
            /* Entry is already empty; nothing to inspect */
            reftable_tmp[i] = 0;
            continue;
        }
        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                              &refblock);
        if (ret < 0) {
            goto out;
        }

        /* the refblock has own reference */
        if (i == offset_to_reftable_index(s, refblock_offs)) {
            /* Temporarily clear the self-reference so a block that tracks
             * nothing but itself still scans as all-zero (unused) */
            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
                                   (s->refcount_block_size - 1);
            uint64_t refcount = s->get_refcount(refblock, block_index);

            s->set_refcount(refblock, block_index, 0);

            unused_block = buffer_is_zero(refblock, s->cluster_size);

            s->set_refcount(refblock, block_index, refcount);
        } else {
            unused_block = buffer_is_zero(refblock, s->cluster_size);
        }
        qcow2_cache_put(s->refcount_block_cache, &refblock);

        /* Build the on-disk (big-endian) image of the pruned reftable */
        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
    }

    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
                           s->refcount_table_size * sizeof(uint64_t));
    /*
     * If the write in the reftable failed the image may contain a partially
     * overwritten reftable. In this case it would be better to clear the
     * reftable in memory to avoid possible image corruption.
     */
    for (i = 0; i < s->refcount_table_size; i++) {
        if (s->refcount_table[i] && !reftable_tmp[i]) {
            if (ret == 0) {
                /* Only free the refblock cluster while no error has
                 * occurred; after an error, just drop the in-memory entry */
                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
                                                       REFT_OFFSET_MASK);
            }
            s->refcount_table[i] = 0;
        }
    }

    if (!s->cache_discards) {
        /* Flush the queued cluster discards unless batching is active */
        qcow2_process_discards(bs, ret);
    }

out:
    g_free(reftable_tmp);
    return ret;
}
3326 
3327 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3328 {
3329     BDRVQcow2State *s = bs->opaque;
3330     int64_t i;
3331 
3332     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3333         uint64_t refcount;
3334         int ret = qcow2_get_refcount(bs, i, &refcount);
3335         if (ret < 0) {
3336             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3337                     i, strerror(-ret));
3338             return ret;
3339         }
3340         if (refcount > 0) {
3341             return i;
3342         }
3343     }
3344     qcow2_signal_corruption(bs, true, -1, -1,
3345                             "There are no references in the refcount table.");
3346     return -EIO;
3347 }
3348