/* xref: /openbmc/qemu/block/qcow2-refcount.c (revision 10df8ff1) */
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"
#include "qemu/cutils.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};

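/*
 * The refcount entry width is 2^refcount_order bits (orders 0..6, i.e.
 * 1-bit to 64-bit entries), so entries narrower than a byte are bit-packed
 * by the accessors defined below. As a worked example for the 2-bit case
 * (refcount_order == 1): entry @index lives in byte index / 4 at bit
 * offset 2 * (index % 4), so index 5 occupies bits 2..3 of byte 1.
 */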

/*********************************************************/
/* refcount handling */

static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}
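
/*
 * Worked example (assuming the default 64 KiB clusters and 16-bit refcount
 * entries): one refblock holds 65536 / 2 = 32768 entries and thus covers
 * 32768 * 64 KiB = 2 GiB of host offsets, so two offsets share a refblock
 * exactly when they agree on all bits above
 * s->cluster_bits + s->refcount_block_bits = 16 + 15 = 31.
 */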

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno on error.
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something: at least a cluster for
     * the new refcount block, and possibly also a new refcount table if the
     * old one is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, the new
     * refcount table needs some space, and we can't simply allocate it the
     * normal way without risking endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
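
/*
 * A sketch of the -EAGAIN protocol used above: callers that take the
 * initial reference on freshly chosen clusters retry the whole allocation,
 * as qcow2_alloc_clusters() below does:
 *
 *     do {
 *         offset = alloc_clusters_noref(bs, size, ...);
 *         ret = update_refcount(bs, offset, size, 1, false, ...);
 *     } while (ret == -EAGAIN);
 *
 * because the refcount metadata allocated here may occupy the very
 * clusters the caller had just picked for its own data.
 */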

/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
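
/*
 * Layout sketch for qcow2_refcount_area() (hypothetical numbers): if
 * @start_offset is at cluster N and two new refblocks plus a one-cluster
 * reftable are needed, the self-describing area becomes
 *
 *     cluster N    : new refblock
 *     cluster N + 1: new refblock
 *     cluster N + 2: reftable (table_offset)
 *
 * and the returned end_offset points at cluster N + 3, where the
 * @additional_clusters may then be placed.
 */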

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_pdiscard(bs->file, d->offset, d->bytes);
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
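
/*
 * Example of the merging above (hypothetical offsets, 64 KiB clusters):
 * freeing cluster 0x10000 queues [0x10000, 0x20000); freeing cluster
 * 0x20000 next extends that region in place to [0x10000, 0x30000), since
 * the two are exactly adjacent. Separately queued regions that become
 * adjacent later are merged by the loop after the "found" label, so
 * qcow2_process_discards() can issue one large bdrv_pdiscard() instead of
 * many small ones.
 */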

/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            /* If the caller needs to restart the search for free clusters,
             * try the same ones first to see if they're still free. */
            if (ret == -EAGAIN) {
                if (s->free_cluster_index > (start >> s->cluster_bits)) {
                    s->free_cluster_index = (start >> s->cluster_bits);
                }
            }
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0) {
            void *table;

            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                qcow2_cache_discard(s->refcount_block_cache, table);
            }

            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
            if (table != NULL) {
                qcow2_cache_discard(s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
                update_refcount_discard(bs, cluster_offset, s->cluster_size);
            }
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases, e.g. when the failure was ENOSPC while allocating a new
     * refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in the requested max */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (max >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}
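
/*
 * alloc_clusters_noref() is a first-fit linear scan: s->free_cluster_index
 * acts as a cursor that only moves forward within a call. For example,
 * when two clusters are requested and the scan finds a used cluster, the
 * "retry" label restarts the two-cluster window just past it, so the
 * returned offset is the first run of nb_clusters consecutive free
 * clusters at or beyond the cursor's starting position.
 */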

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
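
/*
 * A hypothetical packing example for qcow2_alloc_bytes() (64 KiB clusters):
 * three 20000-byte compressed allocations land at byte offsets 0, 20000 and
 * 40000 of the same cluster, s->free_byte_offset advancing past each one;
 * every call also increments the cluster's refcount, so it ends up equal to
 * the number of compressed allocations packed inside. A fourth allocation
 * does not fit into the remaining 5536 bytes and takes a new cluster
 * (possibly straddling the boundary if that cluster turns out to be
 * contiguous).
 */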

void qcow2_free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size,
                          enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;

    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                (l2_entry & s->cluster_offset_mask) & ~511,
                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, false, -1, -1,
                                    "Cannot free unaligned cluster %#llx",
                                    l2_entry & L2E_OFFSET_MASK);
        } else {
            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                nb_clusters << s->cluster_bits, type);
        }
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}
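
/*
 * For the compressed case above, the L2 entry encodes a host byte offset
 * plus a sector count: nb_csectors 512-byte sectors are freed starting at
 * the entry's offset rounded down to a sector boundary (the "& ~511"),
 * because compressed data is stored with sub-cluster granularity while
 * refcounts always track whole host clusters.
 */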

int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    if (qcow2_need_accurate_refcounts(s)) {
        ret = qcow2_cache_write(bs, s->refcount_block_cache);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
{
    int ret = qcow2_write_caches(bs);
    if (ret < 0) {
        return ret;
    }

    return bdrv_flush(bs->file->bs);
}
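
/*
 * Ordering note: qcow2_write_caches() writes out the L2 cache and then the
 * refcount block cache; any stricter ordering between the two is enforced
 * by the qcow2_cache_set_dependency() calls elsewhere in this file.
 * qcow2_flush_caches() additionally flushes the underlying file so the
 * written metadata is actually stable.
 */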

/*********************************************************/
/* snapshots and image creation */



/* update the refcounts of snapshots and the copied flag */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
    n_slices = s->cluster_size / slice_size2;

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = be64_to_cpu(l2_slice[j]);
                    old_entry = entry;
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((entry >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(
                                bs, (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        l2_slice[j] = cpu_to_be64(entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}




/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}
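
/*
 * Worked example: with 16-bit refcounts (refcount_order == 4), 1000
 * entries take 1000 << 4 = 16000 bits, i.e. DIV_ROUND_UP(16000, 8) = 2000
 * bytes; with 1-bit refcounts the same 1000 entries need only 125 bytes.
 */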
1445 
1446 /**
1447  * Reallocates *array so that it can hold new_size entries. *size must contain
1448  * the current number of entries in *array. If the reallocation fails, *array
1449  * and *size will not be modified and -errno will be returned. If the
1450  * reallocation is successful, *array will be set to the new buffer, *size
1451  * will be set to new_size and 0 will be returned. The size of the reallocated
1452  * refcount array buffer will be aligned to a cluster boundary, and the newly
1453  * allocated area will be zeroed.
1454  */
1455 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1456                                   int64_t *size, int64_t new_size)
1457 {
1458     int64_t old_byte_size, new_byte_size;
1459     void *new_ptr;
1460 
1461     /* Round to clusters so the array can be directly written to disk */
1462     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1463                     * s->cluster_size;
1464     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1465                     * s->cluster_size;
1466 
1467     if (new_byte_size == old_byte_size) {
1468         *size = new_size;
1469         return 0;
1470     }
1471 
1472     assert(new_byte_size > 0);
1473 
1474     if (new_byte_size > SIZE_MAX) {
1475         return -ENOMEM;
1476     }
1477 
1478     new_ptr = g_try_realloc(*array, new_byte_size);
1479     if (!new_ptr) {
1480         return -ENOMEM;
1481     }
1482 
1483     if (new_byte_size > old_byte_size) {
1484         memset((char *)new_ptr + old_byte_size, 0,
1485                new_byte_size - old_byte_size);
1486     }
1487 
1488     *array = new_ptr;
1489     *size  = new_size;
1490 
1491     return 0;
1492 }
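     /* A minimal usage sketch (hypothetical caller, not part of this file):
      * the array may start out as NULL with a size of 0, and on failure both
      * values are guaranteed to be left untouched:
      *
      *     void *array = NULL;
      *     int64_t array_size = 0;
      *     ret = realloc_refcount_array(s, &array, &array_size, nb_clusters);
      *     if (ret < 0) {
      *         return ret;    // array is still NULL, array_size still 0
      *     }
      */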
1493 
1494 /*
1495  * Increases the refcount for a range of clusters in a given refcount table.
1496  * This is used to construct a temporary refcount table out of L1 and L2 tables
1497  * which can be compared to the refcount table saved in the image.
1498  *
1499  * Updates the error and corruption counters in res.
1500  */
1501 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1502                              void **refcount_table,
1503                              int64_t *refcount_table_size,
1504                              int64_t offset, int64_t size)
1505 {
1506     BDRVQcow2State *s = bs->opaque;
1507     uint64_t start, last, cluster_offset, k, refcount;
1508     int ret;
1509 
1510     if (size <= 0) {
1511         return 0;
1512     }
1513 
1514     start = start_of_cluster(s, offset);
1515     last = start_of_cluster(s, offset + size - 1);
1516     for (cluster_offset = start; cluster_offset <= last;
1517         cluster_offset += s->cluster_size) {
1518         k = cluster_offset >> s->cluster_bits;
1519         if (k >= *refcount_table_size) {
1520             ret = realloc_refcount_array(s, refcount_table,
1521                                          refcount_table_size, k + 1);
1522             if (ret < 0) {
1523                 res->check_errors++;
1524                 return ret;
1525             }
1526         }
1527 
1528         refcount = s->get_refcount(*refcount_table, k);
1529         if (refcount == s->refcount_max) {
1530             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1531                     "\n", cluster_offset);
1532             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1533                     "width or qemu-img convert to create a clean copy if the "
1534                     "image cannot be opened for writing\n");
1535             res->corruptions++;
1536             continue;
1537         }
1538         s->set_refcount(*refcount_table, k, refcount + 1);
1539     }
1540 
1541     return 0;
1542 }
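     /* Usage sketch (hypothetical values): marking the first three clusters
      * of the image as referenced in the in-memory refcount table (IMRT):
      *
      *     ret = qcow2_inc_refcounts_imrt(bs, res, &refcount_table,
      *                                    &nb_clusters, 0, 3 * s->cluster_size);
      *
      * offset and size are byte values; partially covered clusters are counted
      * in full, and the IMRT is grown on demand. */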
1543 
1544 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1545 enum {
1546     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1547 };
1548 
1549 /*
1550  * Increases the refcount in the given refcount table for all clusters
1551  * referenced in the L2 table. While doing so, performs some checks on L2
1552  * entries.
1553  *
1554  * Returns 0 on success (errors found by the checks are recorded in res) or
1555  * -errno if an internal error occurred.
1556  */
1557 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1558                               void **refcount_table,
1559                               int64_t *refcount_table_size, int64_t l2_offset,
1560                               int flags, BdrvCheckMode fix)
1561 {
1562     BDRVQcow2State *s = bs->opaque;
1563     uint64_t *l2_table, l2_entry;
1564     uint64_t next_contiguous_offset = 0;
1565     int i, l2_size, nb_csectors, ret;
1566 
1567     /* Read L2 table from disk */
1568     l2_size = s->l2_size * sizeof(uint64_t);
1569     l2_table = g_malloc(l2_size);
1570 
1571     ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1572     if (ret < 0) {
1573         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1574         res->check_errors++;
1575         goto fail;
1576     }
1577 
1578     /* Do the actual checks */
1579     for (i = 0; i < s->l2_size; i++) {
1580         l2_entry = be64_to_cpu(l2_table[i]);
1581 
1582         switch (qcow2_get_cluster_type(l2_entry)) {
1583         case QCOW2_CLUSTER_COMPRESSED:
1584             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1585             if (l2_entry & QCOW_OFLAG_COPIED) {
1586                 fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1587                     "copied flag must never be set for compressed "
1588                     "clusters\n", l2_entry & s->cluster_offset_mask);
1589                 l2_entry &= ~QCOW_OFLAG_COPIED;
1590                 res->corruptions++;
1591             }
1592 
1593             /* Mark cluster as used */
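                 /* The compressed cluster descriptor stores the host offset in
                  * its low bits and, above csize_shift, the number of occupied
                  * 512-byte sectors minus one; hence the "+ 1" below. */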
1594             nb_csectors = ((l2_entry >> s->csize_shift) &
1595                            s->csize_mask) + 1;
1596             l2_entry &= s->cluster_offset_mask;
1597             ret = qcow2_inc_refcounts_imrt(bs, res,
1598                                            refcount_table, refcount_table_size,
1599                                            l2_entry & ~511, nb_csectors * 512);
1600             if (ret < 0) {
1601                 goto fail;
1602             }
1603 
1604             if (flags & CHECK_FRAG_INFO) {
1605                 res->bfi.allocated_clusters++;
1606                 res->bfi.compressed_clusters++;
1607 
1608                 /* Compressed clusters are fragmented by nature.  Since they
1609                  * take up sub-sector space but we only have sector-granularity
1610                  * I/O, we need to re-read the same sectors even for adjacent
1611                  * compressed clusters.
1612                  */
1613                 res->bfi.fragmented_clusters++;
1614             }
1615             break;
1616 
1617         case QCOW2_CLUSTER_ZERO_ALLOC:
1618         case QCOW2_CLUSTER_NORMAL:
1619         {
1620             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1621 
1622             if (flags & CHECK_FRAG_INFO) {
1623                 res->bfi.allocated_clusters++;
1624                 if (next_contiguous_offset &&
1625                     offset != next_contiguous_offset) {
1626                     res->bfi.fragmented_clusters++;
1627                 }
1628                 next_contiguous_offset = offset + s->cluster_size;
1629             }
1630 
1631             /* Correct offsets are cluster aligned */
1632             if (offset_into_cluster(s, offset)) {
1633                 if (qcow2_get_cluster_type(l2_entry) ==
1634                     QCOW2_CLUSTER_ZERO_ALLOC)
1635                 {
1636                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
1637                             "cluster is not properly aligned; L2 entry "
1638                             "corrupted.\n",
1639                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1640                             offset);
1641                     if (fix & BDRV_FIX_ERRORS) {
1642                         uint64_t l2e_offset =
1643                             l2_offset + (uint64_t)i * sizeof(uint64_t);
1644 
1645                         l2_entry = QCOW_OFLAG_ZERO;
1646                         l2_table[i] = cpu_to_be64(l2_entry);
1647                         ret = qcow2_pre_write_overlap_check(bs,
1648                                 QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
1649                                 l2e_offset, sizeof(uint64_t));
1650                         if (ret < 0) {
1651                             fprintf(stderr, "ERROR: Overlap check failed\n");
1652                             res->check_errors++;
1653                             /* Something is seriously wrong, so abort checking
1654                              * this L2 table */
1655                             goto fail;
1656                         }
1657 
1658                         ret = bdrv_pwrite_sync(bs->file, l2e_offset,
1659                                                &l2_table[i], sizeof(uint64_t));
1660                         if (ret < 0) {
1661                             fprintf(stderr, "ERROR: Failed to overwrite L2 "
1662                                     "table entry: %s\n", strerror(-ret));
1663                             res->check_errors++;
1664                             /* Do not abort, continue checking the rest of this
1665                              * L2 table's entries */
1666                         } else {
1667                             res->corruptions_fixed++;
1668                             /* Skip marking the cluster as used
1669                              * (it is unused now) */
1670                             continue;
1671                         }
1672                     } else {
1673                         res->corruptions++;
1674                     }
1675                 } else {
1676                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1677                         "not properly aligned; L2 entry corrupted.\n", offset);
1678                     res->corruptions++;
1679                 }
1680             }
1681 
1682             /* Mark cluster as used */
1683             ret = qcow2_inc_refcounts_imrt(bs, res,
1684                                            refcount_table, refcount_table_size,
1685                                            offset, s->cluster_size);
1686             if (ret < 0) {
1687                 goto fail;
1688             }
1689             break;
1690         }
1691 
1692         case QCOW2_CLUSTER_ZERO_PLAIN:
1693         case QCOW2_CLUSTER_UNALLOCATED:
1694             break;
1695 
1696         default:
1697             abort();
1698         }
1699     }
1700 
1701     g_free(l2_table);
1702     return 0;
1703 
1704 fail:
1705     g_free(l2_table);
1706     return ret;
1707 }
1708 
1709 /*
1710  * Increases the refcount for the L1 table, its L2 tables and all referenced
1711  * clusters in the given refcount table. While doing so, performs some checks
1712  * on L1 and L2 entries.
1713  *
1714  * Returns 0 on success (errors found by the checks are recorded in res) or
1715  * -errno if an internal error occurred.
1716  */
1717 static int check_refcounts_l1(BlockDriverState *bs,
1718                               BdrvCheckResult *res,
1719                               void **refcount_table,
1720                               int64_t *refcount_table_size,
1721                               int64_t l1_table_offset, int l1_size,
1722                               int flags, BdrvCheckMode fix)
1723 {
1724     BDRVQcow2State *s = bs->opaque;
1725     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1726     int i, ret;
1727 
1728     l1_size2 = l1_size * sizeof(uint64_t);
1729 
1730     /* Mark L1 table as used */
1731     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1732                                    l1_table_offset, l1_size2);
1733     if (ret < 0) {
1734         goto fail;
1735     }
1736 
1737     /* Read L1 table entries from disk */
1738     if (l1_size2 > 0) {
1739         l1_table = g_try_malloc(l1_size2);
1740         if (l1_table == NULL) {
1741             ret = -ENOMEM;
1742             res->check_errors++;
1743             goto fail;
1744         }
1745         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1746         if (ret < 0) {
1747             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1748             res->check_errors++;
1749             goto fail;
1750         }
1751         for (i = 0; i < l1_size; i++)
1752             be64_to_cpus(&l1_table[i]);
1753     }
1754 
1755     /* Do the actual checks */
1756     for (i = 0; i < l1_size; i++) {
1757         l2_offset = l1_table[i];
1758         if (l2_offset) {
1759             /* Mark L2 table as used */
1760             l2_offset &= L1E_OFFSET_MASK;
1761             ret = qcow2_inc_refcounts_imrt(bs, res,
1762                                            refcount_table, refcount_table_size,
1763                                            l2_offset, s->cluster_size);
1764             if (ret < 0) {
1765                 goto fail;
1766             }
1767 
1768             /* L2 tables are cluster aligned */
1769             if (offset_into_cluster(s, l2_offset)) {
1770                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1771                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1772                 res->corruptions++;
1773             }
1774 
1775             /* Process and check L2 entries */
1776             ret = check_refcounts_l2(bs, res, refcount_table,
1777                                      refcount_table_size, l2_offset, flags,
1778                                      fix);
1779             if (ret < 0) {
1780                 goto fail;
1781             }
1782         }
1783     }
1784     g_free(l1_table);
1785     return 0;
1786 
1787 fail:
1788     g_free(l1_table);
1789     return ret;
1790 }
1791 
1792 /*
1793  * Checks the OFLAG_COPIED flag for all L1 and L2 entries: it must be set if
1794  * and only if the refcount of the referenced cluster is exactly 1.
1795  *
1796  * This function does not print an error message nor increment check_errors if
1797  * qcow2_get_refcount fails, because such an error will have already been
1798  * detected and sufficiently signaled by the caller (qcow2_check_refcounts).
1799  */
1800 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1801                               BdrvCheckMode fix)
1802 {
1803     BDRVQcow2State *s = bs->opaque;
1804     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1805     int ret;
1806     uint64_t refcount;
1807     int i, j;
1808     bool repair;
1809 
1810     if (fix & BDRV_FIX_ERRORS) {
1811         /* Always repair */
1812         repair = true;
1813     } else if (fix & BDRV_FIX_LEAKS) {
1814         /* Repair only if that seems safe: This function is always
1815          * called after the refcounts have been fixed, so the refcount
1816          * is accurate if that repair was successful */
1817         repair = !res->check_errors && !res->corruptions && !res->leaks;
1818     } else {
1819         repair = false;
1820     }
1821 
1822     for (i = 0; i < s->l1_size; i++) {
1823         uint64_t l1_entry = s->l1_table[i];
1824         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1825         bool l2_dirty = false;
1826 
1827         if (!l2_offset) {
1828             continue;
1829         }
1830 
1831         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1832                                  &refcount);
1833         if (ret < 0) {
1834             /* don't print message nor increment check_errors */
1835             continue;
1836         }
1837         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1838             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1839                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1840                     repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1841             if (repair) {
1842                 s->l1_table[i] = refcount == 1
1843                                ? l1_entry |  QCOW_OFLAG_COPIED
1844                                : l1_entry & ~QCOW_OFLAG_COPIED;
1845                 ret = qcow2_write_l1_entry(bs, i);
1846                 if (ret < 0) {
1847                     res->check_errors++;
1848                     goto fail;
1849                 }
1850                 res->corruptions_fixed++;
1851             } else {
1852                 res->corruptions++;
1853             }
1854         }
1855 
1856         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1857                          s->l2_size * sizeof(uint64_t));
1858         if (ret < 0) {
1859             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1860                     strerror(-ret));
1861             res->check_errors++;
1862             goto fail;
1863         }
1864 
1865         for (j = 0; j < s->l2_size; j++) {
1866             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1867             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1868             QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
1869 
1870             if (cluster_type == QCOW2_CLUSTER_NORMAL ||
1871                 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
1872                 ret = qcow2_get_refcount(bs,
1873                                          data_offset >> s->cluster_bits,
1874                                          &refcount);
1875                 if (ret < 0) {
1876                     /* don't print message nor increment check_errors */
1877                     continue;
1878                 }
1879                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1880                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1881                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1882                             repair ? "Repairing" : "ERROR", l2_entry, refcount);
1883                     if (repair) {
1884                         l2_table[j] = cpu_to_be64(refcount == 1
1885                                     ? l2_entry |  QCOW_OFLAG_COPIED
1886                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1887                         l2_dirty = true;
1888                         res->corruptions_fixed++;
1889                     } else {
1890                         res->corruptions++;
1891                     }
1892                 }
1893             }
1894         }
1895 
1896         if (l2_dirty) {
1897             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1898                                                 l2_offset, s->cluster_size);
1899             if (ret < 0) {
1900                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1901                         "overlap check failed: %s\n", strerror(-ret));
1902                 res->check_errors++;
1903                 goto fail;
1904             }
1905 
1906             ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
1907                               s->cluster_size);
1908             if (ret < 0) {
1909                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1910                         strerror(-ret));
1911                 res->check_errors++;
1912                 goto fail;
1913             }
1914         }
1915     }
1916 
1917     ret = 0;
1918 
1919 fail:
1920     qemu_vfree(l2_table);
1921     return ret;
1922 }
1923 
1924 /*
1925  * Checks consistency of refblocks and accounts for each refblock in
1926  * *refcount_table.
1927  */
1928 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1929                            BdrvCheckMode fix, bool *rebuild,
1930                            void **refcount_table, int64_t *nb_clusters)
1931 {
1932     BDRVQcow2State *s = bs->opaque;
1933     int64_t i, size;
1934     int ret;
1935 
1936     for (i = 0; i < s->refcount_table_size; i++) {
1937         uint64_t offset, cluster;
1938         offset = s->refcount_table[i];
1939         cluster = offset >> s->cluster_bits;
1940 
1941         /* Refcount blocks are cluster aligned */
1942         if (offset_into_cluster(s, offset)) {
1943             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1944                 "cluster aligned; refcount table entry corrupted\n", i);
1945             res->corruptions++;
1946             *rebuild = true;
1947             continue;
1948         }
1949 
1950         if (cluster >= *nb_clusters) {
1951             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1952                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1953 
1954             if (fix & BDRV_FIX_ERRORS) {
1955                 int64_t new_nb_clusters;
1956                 Error *local_err = NULL;
1957 
1958                 if (offset > INT64_MAX - s->cluster_size) {
1959                     ret = -EINVAL;
1960                     goto resize_fail;
1961                 }
1962 
1963                 ret = bdrv_truncate(bs->file, offset + s->cluster_size,
1964                                     PREALLOC_MODE_OFF, &local_err);
1965                 if (ret < 0) {
1966                     error_report_err(local_err);
1967                     goto resize_fail;
1968                 }
1969                 size = bdrv_getlength(bs->file->bs);
1970                 if (size < 0) {
1971                     ret = size;
1972                     goto resize_fail;
1973                 }
1974 
1975                 new_nb_clusters = size_to_clusters(s, size);
1976                 assert(new_nb_clusters >= *nb_clusters);
1977 
1978                 ret = realloc_refcount_array(s, refcount_table,
1979                                              nb_clusters, new_nb_clusters);
1980                 if (ret < 0) {
1981                     res->check_errors++;
1982                     return ret;
1983                 }
1984 
1985                 if (cluster >= *nb_clusters) {
1986                     ret = -EINVAL;
1987                     goto resize_fail;
1988                 }
1989 
1990                 res->corruptions_fixed++;
1991                 ret = qcow2_inc_refcounts_imrt(bs, res,
1992                                                refcount_table, nb_clusters,
1993                                                offset, s->cluster_size);
1994                 if (ret < 0) {
1995                     return ret;
1996                 }
1997                 /* No need to check whether the refcount is now greater than 1:
1998                  * This area was just allocated and zeroed, so it can only be
1999                  * exactly 1 after qcow2_inc_refcounts_imrt() */
2000                 continue;
2001 
2002 resize_fail:
2003                 res->corruptions++;
2004                 *rebuild = true;
2005                 fprintf(stderr, "ERROR could not resize image: %s\n",
2006                         strerror(-ret));
2007             } else {
2008                 res->corruptions++;
2009             }
2010             continue;
2011         }
2012 
2013         if (offset != 0) {
2014             ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2015                                            offset, s->cluster_size);
2016             if (ret < 0) {
2017                 return ret;
2018             }
2019             if (s->get_refcount(*refcount_table, cluster) != 1) {
2020                 fprintf(stderr, "ERROR refcount block %" PRId64
2021                         " refcount=%" PRIu64 "\n", i,
2022                         s->get_refcount(*refcount_table, cluster));
2023                 res->corruptions++;
2024                 *rebuild = true;
2025             }
2026         }
2027     }
2028 
2029     return 0;
2030 }
2031 
2032 /*
2033  * Calculates an in-memory refcount table.
2034  */
2035 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2036                                BdrvCheckMode fix, bool *rebuild,
2037                                void **refcount_table, int64_t *nb_clusters)
2038 {
2039     BDRVQcow2State *s = bs->opaque;
2040     int64_t i;
2041     QCowSnapshot *sn;
2042     int ret;
2043 
2044     if (!*refcount_table) {
2045         int64_t old_size = 0;
2046         ret = realloc_refcount_array(s, refcount_table,
2047                                      &old_size, *nb_clusters);
2048         if (ret < 0) {
2049             res->check_errors++;
2050             return ret;
2051         }
2052     }
2053 
2054     /* header */
2055     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2056                                    0, s->cluster_size);
2057     if (ret < 0) {
2058         return ret;
2059     }
2060 
2061     /* current L1 table */
2062     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2063                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2064                              fix);
2065     if (ret < 0) {
2066         return ret;
2067     }
2068 
2069     /* snapshots */
2070     for (i = 0; i < s->nb_snapshots; i++) {
2071         sn = s->snapshots + i;
2072         if (offset_into_cluster(s, sn->l1_table_offset)) {
2073             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2074                     "L1 table is not cluster aligned; snapshot table entry "
2075                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2076             res->corruptions++;
2077             continue;
2078         }
2079         if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
2080             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2081                     "L1 table is too large; snapshot table entry corrupted\n",
2082                     sn->id_str, sn->name, sn->l1_size);
2083             res->corruptions++;
2084             continue;
2085         }
2086         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2087                                  sn->l1_table_offset, sn->l1_size, 0, fix);
2088         if (ret < 0) {
2089             return ret;
2090         }
2091     }
2092     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2093                                    s->snapshots_offset, s->snapshots_size);
2094     if (ret < 0) {
2095         return ret;
2096     }
2097 
2098     /* refcount data */
2099     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2100                                    s->refcount_table_offset,
2101                                    s->refcount_table_size * sizeof(uint64_t));
2102     if (ret < 0) {
2103         return ret;
2104     }
2105 
2106     /* encryption */
2107     if (s->crypto_header.length) {
2108         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2109                                        s->crypto_header.offset,
2110                                        s->crypto_header.length);
2111         if (ret < 0) {
2112             return ret;
2113         }
2114     }
2115 
2116     /* bitmaps */
2117     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2118     if (ret < 0) {
2119         return ret;
2120     }
2121 
2122     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2123 }
2124 
2125 /*
2126  * Compares, for each cluster in the image, the refcount reported by the
2127  * on-disk refcount structures against the refcount computed in refcount_table.
2128  */
2129 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2130                               BdrvCheckMode fix, bool *rebuild,
2131                               int64_t *highest_cluster,
2132                               void *refcount_table, int64_t nb_clusters)
2133 {
2134     BDRVQcow2State *s = bs->opaque;
2135     int64_t i;
2136     uint64_t refcount1, refcount2;
2137     int ret;
2138 
2139     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2140         ret = qcow2_get_refcount(bs, i, &refcount1);
2141         if (ret < 0) {
2142             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2143                     i, strerror(-ret));
2144             res->check_errors++;
2145             continue;
2146         }
2147 
2148         refcount2 = s->get_refcount(refcount_table, i);
2149 
2150         if (refcount1 > 0 || refcount2 > 0) {
2151             *highest_cluster = i;
2152         }
2153 
2154         if (refcount1 != refcount2) {
2155             /* Check if we're allowed to fix the mismatch */
2156             int *num_fixed = NULL;
2157             if (refcount1 == 0) {
2158                 *rebuild = true;
2159             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2160                 num_fixed = &res->leaks_fixed;
2161             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2162                 num_fixed = &res->corruptions_fixed;
2163             }
2164 
2165             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2166                     " reference=%" PRIu64 "\n",
2167                    num_fixed != NULL     ? "Repairing" :
2168                    refcount1 < refcount2 ? "ERROR" :
2169                                            "Leaked",
2170                    i, refcount1, refcount2);
2171 
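                 /* Repairing means making the on-disk refcount match the
                  * computed one: apply the absolute difference, decreasing
                  * when the on-disk value (refcount1) is the larger one. */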
2172             if (num_fixed) {
2173                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2174                                       refcount_diff(refcount1, refcount2),
2175                                       refcount1 > refcount2,
2176                                       QCOW2_DISCARD_ALWAYS);
2177                 if (ret >= 0) {
2178                     (*num_fixed)++;
2179                     continue;
2180                 }
2181             }
2182 
2183             /* And if we couldn't, print an error */
2184             if (refcount1 < refcount2) {
2185                 res->corruptions++;
2186             } else {
2187                 res->leaks++;
2188             }
2189         }
2190     }
2191 }
2192 
2193 /*
2194  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2195  * the on-disk refcount structures.
2196  *
2197  * On input, *first_free_cluster tells where to start looking, and need not
2198  * actually be a free cluster; the returned offset will not be before that
2199  * cluster.  On output, *first_free_cluster points to the first gap found, even
2200  * if that gap was too small to be used as the returned offset.
2201  *
2202  * Note that *first_free_cluster is a cluster index whereas the return value is
2203  * an offset.
2204  */
2205 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2206                                    int cluster_count,
2207                                    void **refcount_table,
2208                                    int64_t *imrt_nb_clusters,
2209                                    int64_t *first_free_cluster)
2210 {
2211     BDRVQcow2State *s = bs->opaque;
2212     int64_t cluster = *first_free_cluster, i;
2213     bool first_gap = true;
2214     int contiguous_free_clusters;
2215     int ret;
2216 
2217     /* Starting at *first_free_cluster, find a range of at least cluster_count
2218      * contiguous free clusters */
2219     for (contiguous_free_clusters = 0;
2220          cluster < *imrt_nb_clusters &&
2221          contiguous_free_clusters < cluster_count;
2222          cluster++)
2223     {
2224         if (!s->get_refcount(*refcount_table, cluster)) {
2225             contiguous_free_clusters++;
2226             if (first_gap) {
2227                 /* If this is the first free cluster found, update
2228                  * *first_free_cluster accordingly */
2229                 *first_free_cluster = cluster;
2230                 first_gap = false;
2231             }
2232         } else if (contiguous_free_clusters) {
2233             contiguous_free_clusters = 0;
2234         }
2235     }
2236 
2237     /* If contiguous_free_clusters is greater than zero, it contains the number
2238      * of contiguous free clusters up to the current cluster; the first free
2239      * cluster in the current "gap" is therefore
2240      * cluster - contiguous_free_clusters */
2241 
2242     /* If no such range could be found, grow the in-memory refcount table
2243      * so as to append free clusters at the end of the image */
2244     if (contiguous_free_clusters < cluster_count) {
2245         /* contiguous_free_clusters clusters are already empty at the image end;
2246          * we need cluster_count clusters; therefore, we have to allocate
2247          * cluster_count - contiguous_free_clusters new clusters at the end of
2248          * the image (which is the current value of cluster; note that cluster
2249          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2250          * the image end) */
2251         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2252                                      cluster + cluster_count
2253                                      - contiguous_free_clusters);
2254         if (ret < 0) {
2255             return ret;
2256         }
2257     }
2258 
2259     /* Go back to the first free cluster */
2260     cluster -= contiguous_free_clusters;
2261     for (i = 0; i < cluster_count; i++) {
2262         s->set_refcount(*refcount_table, cluster + i, 1);
2263     }
2264 
2265     return cluster << s->cluster_bits;
2266 }
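     /* Usage sketch (hypothetical values, mirroring the callers below):
      * allocating one cluster while scanning from the start of the image:
      *
      *     int64_t first_free_cluster = 0;
      *     int64_t offset = alloc_clusters_imrt(bs, 1, &refcount_table,
      *                                          &nb_clusters,
      *                                          &first_free_cluster);
      *     if (offset < 0) {
      *         return offset;    // -errno from growing the IMRT
      *     }
      */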
2267 
2268 /*
2269  * Creates a new refcount structure based solely on the in-memory information
2270  * given through *refcount_table. All necessary allocations will be reflected
2271  * in that array.
2272  *
2273  * On success, the old refcount structure is leaked (it will be covered by the
2274  * new refcount structure).
2275  */
2276 static int rebuild_refcount_structure(BlockDriverState *bs,
2277                                       BdrvCheckResult *res,
2278                                       void **refcount_table,
2279                                       int64_t *nb_clusters)
2280 {
2281     BDRVQcow2State *s = bs->opaque;
2282     int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2283     int64_t refblock_offset, refblock_start, refblock_index;
2284     uint32_t reftable_size = 0;
2285     uint64_t *on_disk_reftable = NULL;
2286     void *on_disk_refblock;
2287     int ret = 0;
2288     struct {
2289         uint64_t reftable_offset;
2290         uint32_t reftable_clusters;
2291     } QEMU_PACKED reftable_offset_and_clusters;
2292 
2293     qcow2_cache_empty(bs, s->refcount_block_cache);
2294 
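         /* The rebuild works in two passes over the IMRT: refblocks are
          * allocated (in the IMRT) and written out for every used cluster;
          * if the reftable cannot be squeezed in along the way, it is
          * allocated past all refblocks and we jump back to write_refblocks
          * once, so that the clusters newly allocated for it are covered by
          * refblocks as well. */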
2295 write_refblocks:
2296     for (; cluster < *nb_clusters; cluster++) {
2297         if (!s->get_refcount(*refcount_table, cluster)) {
2298             continue;
2299         }
2300 
2301         refblock_index = cluster >> s->refcount_block_bits;
2302         refblock_start = refblock_index << s->refcount_block_bits;
2303 
2304         /* Don't allocate a cluster in a refblock already written to disk */
2305         if (first_free_cluster < refblock_start) {
2306             first_free_cluster = refblock_start;
2307         }
2308         refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2309                                               nb_clusters, &first_free_cluster);
2310         if (refblock_offset < 0) {
2311             fprintf(stderr, "ERROR allocating refblock: %s\n",
2312                     strerror(-refblock_offset));
2313             res->check_errors++;
2314             ret = refblock_offset;
2315             goto fail;
2316         }
2317 
2318         if (reftable_size <= refblock_index) {
2319             uint32_t old_reftable_size = reftable_size;
2320             uint64_t *new_on_disk_reftable;
2321 
2322             reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2323                                      s->cluster_size) / sizeof(uint64_t);
2324             new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2325                                                  reftable_size *
2326                                                  sizeof(uint64_t));
2327             if (!new_on_disk_reftable) {
2328                 res->check_errors++;
2329                 ret = -ENOMEM;
2330                 goto fail;
2331             }
2332             on_disk_reftable = new_on_disk_reftable;
2333 
2334             memset(on_disk_reftable + old_reftable_size, 0,
2335                    (reftable_size - old_reftable_size) * sizeof(uint64_t));
2336 
2337             /* The offset we have for the reftable is now no longer valid;
2338              * this will leak that range, but we can easily fix that by running
2339              * a leak-fixing check after this rebuild operation */
2340             reftable_offset = -1;
2341         } else {
2342             assert(on_disk_reftable);
2343         }
2344         on_disk_reftable[refblock_index] = refblock_offset;
2345 
2346         /* If this is apparently the last refblock (for now), try to squeeze the
2347          * reftable in */
2348         if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2349             reftable_offset < 0)
2350         {
2351             uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2352                                                           sizeof(uint64_t));
2353             reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2354                                                   refcount_table, nb_clusters,
2355                                                   &first_free_cluster);
2356             if (reftable_offset < 0) {
2357                 fprintf(stderr, "ERROR allocating reftable: %s\n",
2358                         strerror(-reftable_offset));
2359                 res->check_errors++;
2360                 ret = reftable_offset;
2361                 goto fail;
2362             }
2363         }
2364 
2365         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2366                                             s->cluster_size);
2367         if (ret < 0) {
2368             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2369             goto fail;
2370         }
2371 
2372         /* The size of *refcount_table is always cluster-aligned, therefore the
2373          * write operation will not overflow */
2374         on_disk_refblock = (void *)((char *) *refcount_table +
2375                                     refblock_index * s->cluster_size);
2376 
2377         ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
2378                          on_disk_refblock, s->cluster_sectors);
2379         if (ret < 0) {
2380             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2381             goto fail;
2382         }
2383 
2384         /* Go to the end of this refblock */
2385         cluster = refblock_start + s->refcount_block_size - 1;
2386     }
2387 
2388     if (reftable_offset < 0) {
2389         uint64_t post_refblock_start, reftable_clusters;
2390 
2391         post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2392         reftable_clusters = size_to_clusters(s,
2393                                              reftable_size * sizeof(uint64_t));
2394         /* Not pretty but simple */
2395         if (first_free_cluster < post_refblock_start) {
2396             first_free_cluster = post_refblock_start;
2397         }
2398         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2399                                               refcount_table, nb_clusters,
2400                                               &first_free_cluster);
2401         if (reftable_offset < 0) {
2402             fprintf(stderr, "ERROR allocating reftable: %s\n",
2403                     strerror(-reftable_offset));
2404             res->check_errors++;
2405             ret = reftable_offset;
2406             goto fail;
2407         }
2408 
2409         goto write_refblocks;
2410     }
2411 
2412     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2413         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2414     }
2415 
2416     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2417                                         reftable_size * sizeof(uint64_t));
2418     if (ret < 0) {
2419         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2420         goto fail;
2421     }
2422 
2423     assert(reftable_size < INT_MAX / sizeof(uint64_t));
2424     ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2425                       reftable_size * sizeof(uint64_t));
2426     if (ret < 0) {
2427         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2428         goto fail;
2429     }
2430 
2431     /* Enter new reftable into the image header */
2432     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2433     reftable_offset_and_clusters.reftable_clusters =
2434         cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2435     ret = bdrv_pwrite_sync(bs->file,
2436                            offsetof(QCowHeader, refcount_table_offset),
2437                            &reftable_offset_and_clusters,
2438                            sizeof(reftable_offset_and_clusters));
2439     if (ret < 0) {
2440         fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2441         goto fail;
2442     }
2443 
2444     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2445         be64_to_cpus(&on_disk_reftable[refblock_index]);
2446     }
2447     s->refcount_table = on_disk_reftable;
2448     s->refcount_table_offset = reftable_offset;
2449     s->refcount_table_size = reftable_size;
2450     update_max_refcount_table_index(s);
2451 
2452     return 0;
2453 
2454 fail:
2455     g_free(on_disk_reftable);
2456     return ret;
2457 }
2458 
2459 /*
2460  * Checks an image for refcount consistency.
2461  *
2462  * Returns 0 if the check completed (any errors found are reported in res),
2463  * and -errno when an internal error occurred.
2464  */
2465 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2466                           BdrvCheckMode fix)
2467 {
2468     BDRVQcow2State *s = bs->opaque;
2469     BdrvCheckResult pre_compare_res;
2470     int64_t size, highest_cluster, nb_clusters;
2471     void *refcount_table = NULL;
2472     bool rebuild = false;
2473     int ret;
2474 
2475     size = bdrv_getlength(bs->file->bs);
2476     if (size < 0) {
2477         res->check_errors++;
2478         return size;
2479     }
2480 
2481     nb_clusters = size_to_clusters(s, size);
2482     if (nb_clusters > INT_MAX) {
2483         res->check_errors++;
2484         return -EFBIG;
2485     }
2486 
2487     res->bfi.total_clusters =
2488         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2489 
2490     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2491                               &nb_clusters);
2492     if (ret < 0) {
2493         goto fail;
2494     }
2495 
2496     /* In case we don't need to rebuild the refcount structure (but want to fix
2497      * something), compare_refcounts() is called a second time further down, in
2498      * which case this first result should be ignored */
2499     pre_compare_res = *res;
2500     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2501                       nb_clusters);
2502 
2503     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2504         BdrvCheckResult old_res = *res;
2505         int fresh_leaks = 0;
2506 
2507         fprintf(stderr, "Rebuilding refcount structure\n");
2508         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2509                                          &nb_clusters);
2510         if (ret < 0) {
2511             goto fail;
2512         }
2513 
2514         res->corruptions = 0;
2515         res->leaks = 0;
2516 
2517         /* Because the old reftable has been exchanged for a new one, the
2518          * references have to be recalculated */
2519         rebuild = false;
2520         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2521         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2522                                   &nb_clusters);
2523         if (ret < 0) {
2524             goto fail;
2525         }
2526 
2527         if (fix & BDRV_FIX_LEAKS) {
2528             /* The old refcount structures are now leaked, fix it; the result
2529              * can be ignored, aside from leaks which were introduced by
2530              * rebuild_refcount_structure() that could not be fixed */
2531             BdrvCheckResult saved_res = *res;
2532             *res = (BdrvCheckResult){ 0 };
2533 
2534             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2535                               &highest_cluster, refcount_table, nb_clusters);
2536             if (rebuild) {
2537                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2538                         "broken\n");
2539             }
2540 
2541             /* Any leaks accounted for here were introduced by
2542              * rebuild_refcount_structure() because that function has created a
2543              * new refcount structure from scratch */
2544             fresh_leaks = res->leaks;
2545             *res = saved_res;
2546         }
2547 
2548         if (res->corruptions < old_res.corruptions) {
2549             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2550         }
2551         if (res->leaks < old_res.leaks) {
2552             res->leaks_fixed += old_res.leaks - res->leaks;
2553         }
2554         res->leaks += fresh_leaks;
2555     } else if (fix) {
2556         if (rebuild) {
2557             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2558             res->check_errors++;
2559             ret = -EIO;
2560             goto fail;
2561         }
2562 
2563         if (res->leaks || res->corruptions) {
2564             *res = pre_compare_res;
2565             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2566                               refcount_table, nb_clusters);
2567         }
2568     }
2569 
2570     /* check OFLAG_COPIED */
2571     ret = check_oflag_copied(bs, res, fix);
2572     if (ret < 0) {
2573         goto fail;
2574     }
2575 
2576     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2577     ret = 0;
2578 
2579 fail:
2580     g_free(refcount_table);
2581 
2582     return ret;
2583 }
2584 
2585 #define overlaps_with(ofs, sz) \
2586     ranges_overlap(offset, size, ofs, sz)
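     /* Note that overlaps_with() deliberately refers to the "offset" and
      * "size" parameters of the enclosing function; it is only meaningful
      * inside qcow2_check_metadata_overlap() below. */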
2587 
2588 /*
2589  * Checks if the given offset into the image file is actually free to use by
2590  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2591  * i.e. a sanity check without relying on the refcount tables.
2592  *
2593  * The ign parameter specifies what checks not to perform (being a bitmask of
2594  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2595  *
2596  * Returns:
2597  * - 0 if writing to this offset will not affect the mentioned metadata
2598  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2599  * - a negative value (-errno) indicating an error while performing a check,
2600  *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2601  */
2602 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2603                                  int64_t size)
2604 {
2605     BDRVQcow2State *s = bs->opaque;
2606     int chk = s->overlap_check & ~ign;
2607     int i, j;
2608 
2609     if (!size) {
2610         return 0;
2611     }
2612 
2613     if (chk & QCOW2_OL_MAIN_HEADER) {
2614         if (offset < s->cluster_size) {
2615             return QCOW2_OL_MAIN_HEADER;
2616         }
2617     }
2618 
2619     /* align range to test to cluster boundaries */
2620     size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2621     offset = start_of_cluster(s, offset);
2622 
2623     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2624         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2625             return QCOW2_OL_ACTIVE_L1;
2626         }
2627     }
2628 
2629     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2630         if (overlaps_with(s->refcount_table_offset,
2631             s->refcount_table_size * sizeof(uint64_t))) {
2632             return QCOW2_OL_REFCOUNT_TABLE;
2633         }
2634     }
2635 
2636     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2637         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2638             return QCOW2_OL_SNAPSHOT_TABLE;
2639         }
2640     }
2641 
2642     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2643         for (i = 0; i < s->nb_snapshots; i++) {
2644             if (s->snapshots[i].l1_size &&
2645                 overlaps_with(s->snapshots[i].l1_table_offset,
2646                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2647                 return QCOW2_OL_INACTIVE_L1;
2648             }
2649         }
2650     }
2651 
2652     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2653         for (i = 0; i < s->l1_size; i++) {
2654             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2655                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2656                 s->cluster_size)) {
2657                 return QCOW2_OL_ACTIVE_L2;
2658             }
2659         }
2660     }
2661 
2662     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2663         unsigned last_entry = s->max_refcount_table_index;
2664         assert(last_entry < s->refcount_table_size);
2665         assert(last_entry + 1 == s->refcount_table_size ||
2666                (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2667         for (i = 0; i <= last_entry; i++) {
2668             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2669                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2670                 s->cluster_size)) {
2671                 return QCOW2_OL_REFCOUNT_BLOCK;
2672             }
2673         }
2674     }
2675 
2676     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2677         for (i = 0; i < s->nb_snapshots; i++) {
2678             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2679             uint32_t l1_sz  = s->snapshots[i].l1_size;
2680             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2681             uint64_t *l1;
2682             int ret;
2683 
2684             ret = qcow2_validate_table(bs, l1_ofs, l1_sz, sizeof(uint64_t),
2685                                        QCOW_MAX_L1_SIZE, "", NULL);
2686             if (ret < 0) {
2687                 return ret;
2688             }
2689 
2690             l1 = g_try_malloc(l1_sz2);
2691 
2692             if (l1_sz2 && l1 == NULL) {
2693                 return -ENOMEM;
2694             }
2695 
2696             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2697             if (ret < 0) {
2698                 g_free(l1);
2699                 return ret;
2700             }
2701 
2702             for (j = 0; j < l1_sz; j++) {
2703                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2704                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2705                     g_free(l1);
2706                     return QCOW2_OL_INACTIVE_L2;
2707                 }
2708             }
2709 
2710             g_free(l1);
2711         }
2712     }
2713 
2714     if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
2715         (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
2716     {
2717         if (overlaps_with(s->bitmap_directory_offset,
2718                           s->bitmap_directory_size))
2719         {
2720             return QCOW2_OL_BITMAP_DIRECTORY;
2721         }
2722     }
2723 
2724     return 0;
2725 }
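     /* Usage sketch: a positive return value is a single QCow2MetadataOverlap
      * bit that can be mapped to a human-readable name (this is exactly what
      * qcow2_pre_write_overlap_check() below does):
      *
      *     int ret = qcow2_check_metadata_overlap(bs, 0, offset, size);
      *     if (ret > 0) {
      *         fprintf(stderr, "write overlaps with %s\n",
      *                 metadata_ol_names[ctz32(ret)]);
      *     }
      */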
2726 
2727 static const char *metadata_ol_names[] = {
2728     [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
2729     [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
2730     [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
2731     [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
2732     [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
2733     [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
2734     [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
2735     [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
2736     [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
2737 };
2738 QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
2739 
2740 /*
2741  * First performs a check for metadata overlaps (through
2742  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2743  * while performing a check), that value is returned. If an impending overlap
2744  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2745  * and -EIO returned.
2746  *
2747  * Returns 0 if there were neither overlaps nor errors while checking for
2748  * overlaps; or a negative value (-errno) on error.
2749  */
2750 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2751                                   int64_t size)
2752 {
2753     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2754 
2755     if (ret < 0) {
2756         return ret;
2757     } else if (ret > 0) {
2758         int metadata_ol_bitnr = ctz32(ret);
2759         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2760 
2761         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2762                                 "write on metadata (overlaps with %s)",
2763                                 metadata_ol_names[metadata_ol_bitnr]);
2764         return -EIO;
2765     }
2766 
2767     return 0;
2768 }
2769 
2770 /* A pointer to a function of this type is given to walk_over_reftable(). That
2771  * function will create refblocks and pass them to a RefblockFinishOp once they
2772  * are completed (@refblock). @refblock_empty is set if the refblock is
2773  * completely empty.
2774  *
2775  * Along with the refblock, a corresponding reftable entry is passed, in the
2776  * reftable @reftable (which may be reallocated) at @reftable_index.
2777  *
2778  * @allocated should be set to true if a new cluster has been allocated.
2779  */
2780 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2781                                uint64_t reftable_index, uint64_t *reftable_size,
2782                                void *refblock, bool refblock_empty,
2783                                bool *allocated, Error **errp);
2784 
2785 /**
2786  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2787  * it is not empty) and inserts its offset into the new reftable. The size of
2788  * this new reftable is increased as required.
2789  */
2790 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2791                           uint64_t reftable_index, uint64_t *reftable_size,
2792                           void *refblock, bool refblock_empty, bool *allocated,
2793                           Error **errp)
2794 {
2795     BDRVQcow2State *s = bs->opaque;
2796     int64_t offset;
2797 
2798     if (!refblock_empty && reftable_index >= *reftable_size) {
2799         uint64_t *new_reftable;
2800         uint64_t new_reftable_size;
2801 
2802         new_reftable_size = ROUND_UP(reftable_index + 1,
2803                                      s->cluster_size / sizeof(uint64_t));
2804         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2805             error_setg(errp,
2806                        "This operation would make the refcount table grow "
2807                        "beyond the maximum size supported by QEMU, aborting");
2808             return -ENOTSUP;
2809         }
2810 
2811         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2812                                                 sizeof(uint64_t));
2813         if (!new_reftable) {
2814             error_setg(errp, "Failed to increase reftable buffer size");
2815             return -ENOMEM;
2816         }
2817 
2818         memset(new_reftable + *reftable_size, 0,
2819                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2820 
2821         *reftable      = new_reftable;
2822         *reftable_size = new_reftable_size;
2823     }
2824 
2825     if (!refblock_empty && !(*reftable)[reftable_index]) {
2826         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2827         if (offset < 0) {
2828             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2829             return offset;
2830         }
2831         (*reftable)[reftable_index] = offset;
2832         *allocated = true;
2833     }
2834 
2835     return 0;
2836 }
2837 
2838 /**
2839  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2840  * offset specified by the new reftable's entry. It does not modify the new
2841  * reftable or change any refcounts.
2842  */
2843 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2844                           uint64_t reftable_index, uint64_t *reftable_size,
2845                           void *refblock, bool refblock_empty, bool *allocated,
2846                           Error **errp)
2847 {
2848     BDRVQcow2State *s = bs->opaque;
2849     int64_t offset;
2850     int ret;
2851 
2852     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2853         offset = (*reftable)[reftable_index];
2854 
2855         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2856         if (ret < 0) {
2857             error_setg_errno(errp, -ret, "Overlap check failed");
2858             return ret;
2859         }
2860 
2861         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2862         if (ret < 0) {
2863             error_setg_errno(errp, -ret, "Failed to write refblock");
2864             return ret;
2865         }
2866     } else {
2867         assert(refblock_empty);
2868     }
2869 
2870     return 0;
2871 }
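
/* [Editor's note] The assert(refblock_empty) in the else branch above relies
 * on the preceding allocation walk: alloc_refblock() leaves a reftable entry
 * at zero only for completely empty refblocks, and no clusters are allocated
 * between the final allocation walk and this flushing walk, so a missing
 * entry can only belong to an empty refblock. */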
2872 
2873 /**
2874  * This function walks over the existing reftable and every referenced refblock;
2875  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2876  * create a matching entry in the passed @new_refblock. Once that
2877  * @new_refblock is completely filled, @operation will be called.
2878  *
2879  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2880  * @index is the index of this walk_over_reftable() call and @total is the
2881  * total number of walk_over_reftable() calls per amend operation. Both are
2882  * used to calculate the parameters for the status callback.
2883  *
2884  * @allocated is set to true if a new cluster has been allocated.
2885  */
2886 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2887                               uint64_t *new_reftable_index,
2888                               uint64_t *new_reftable_size,
2889                               void *new_refblock, int new_refblock_size,
2890                               int new_refcount_bits,
2891                               RefblockFinishOp *operation, bool *allocated,
2892                               Qcow2SetRefcountFunc *new_set_refcount,
2893                               BlockDriverAmendStatusCB *status_cb,
2894                               void *cb_opaque, int index, int total,
2895                               Error **errp)
2896 {
2897     BDRVQcow2State *s = bs->opaque;
2898     uint64_t reftable_index;
2899     bool new_refblock_empty = true;
2900     int refblock_index;
2901     int new_refblock_index = 0;
2902     int ret;
2903 
2904     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2905          reftable_index++)
2906     {
2907         uint64_t refblock_offset = s->refcount_table[reftable_index]
2908                                  & REFT_OFFSET_MASK;
2909 
2910         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2911                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2912 
2913         if (refblock_offset) {
2914             void *refblock;
2915 
2916             if (offset_into_cluster(s, refblock_offset)) {
2917                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2918                                         PRIx64 " unaligned (reftable index: %#"
2919                                         PRIx64 ")", refblock_offset,
2920                                         reftable_index);
2921                 error_setg(errp,
2922                            "Image is corrupt (unaligned refblock offset)");
2923                 return -EIO;
2924             }
2925 
2926             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2927                                   &refblock);
2928             if (ret < 0) {
2929                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2930                 return ret;
2931             }
2932 
2933             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2934                  refblock_index++)
2935             {
2936                 uint64_t refcount;
2937 
2938                 if (new_refblock_index >= new_refblock_size) {
2939                     /* new_refblock is now complete */
2940                     ret = operation(bs, new_reftable, *new_reftable_index,
2941                                     new_reftable_size, new_refblock,
2942                                     new_refblock_empty, allocated, errp);
2943                     if (ret < 0) {
2944                         qcow2_cache_put(s->refcount_block_cache, &refblock);
2945                         return ret;
2946                     }
2947 
2948                     (*new_reftable_index)++;
2949                     new_refblock_index = 0;
2950                     new_refblock_empty = true;
2951                 }
2952 
2953                 refcount = s->get_refcount(refblock, refblock_index);
2954                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2955                     uint64_t offset;
2956 
2957                     qcow2_cache_put(s->refcount_block_cache, &refblock);
2958 
2959                     offset = ((reftable_index << s->refcount_block_bits)
2960                               + refblock_index) << s->cluster_bits;
2961 
2962                     error_setg(errp, "Cannot decrease refcount entry width to "
2963                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2964                                "refcount of %" PRIu64, new_refcount_bits,
2965                                offset, refcount);
2966                     return -EINVAL;
2967                 }
2968 
2969                 if (new_set_refcount) {
2970                     new_set_refcount(new_refblock, new_refblock_index++,
2971                                      refcount);
2972                 } else {
2973                     new_refblock_index++;
2974                 }
2975                 new_refblock_empty = new_refblock_empty && refcount == 0;
2976             }
2977 
2978             qcow2_cache_put(s->refcount_block_cache, &refblock);
2979         } else {
2980             /* No refblock means every refcount is 0 */
2981             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2982                  refblock_index++)
2983             {
2984                 if (new_refblock_index >= new_refblock_size) {
2985                     /* new_refblock is now complete */
2986                     ret = operation(bs, new_reftable, *new_reftable_index,
2987                                     new_reftable_size, new_refblock,
2988                                     new_refblock_empty, allocated, errp);
2989                     if (ret < 0) {
2990                         return ret;
2991                     }
2992 
2993                     (*new_reftable_index)++;
2994                     new_refblock_index = 0;
2995                     new_refblock_empty = true;
2996                 }
2997 
2998                 if (new_set_refcount) {
2999                     new_set_refcount(new_refblock, new_refblock_index++, 0);
3000                 } else {
3001                     new_refblock_index++;
3002                 }
3003             }
3004         }
3005     }
3006 
3007     if (new_refblock_index > 0) {
3008         /* Complete the final refblock, which may be only partially filled */
3009         if (new_set_refcount) {
3010             for (; new_refblock_index < new_refblock_size;
3011                  new_refblock_index++)
3012             {
3013                 new_set_refcount(new_refblock, new_refblock_index, 0);
3014             }
3015         }
3016 
3017         ret = operation(bs, new_reftable, *new_reftable_index,
3018                         new_reftable_size, new_refblock, new_refblock_empty,
3019                         allocated, errp);
3020         if (ret < 0) {
3021             return ret;
3022         }
3023 
3024         (*new_reftable_index)++;
3025     }
3026 
3027     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3028               (uint64_t)total * s->refcount_table_size, cb_opaque);
3029 
3030     return 0;
3031 }
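
/* [Editor's note] The offset reconstruction in the error path above, spelled
 * out: each reftable entry covers refcount_block_size clusters, so the
 * affected cluster index is
 *     (reftable_index << refcount_block_bits) + refblock_index
 * and shifting that by cluster_bits yields the byte offset. For example, with
 * 64 kiB clusters and 16-bit refcounts (refcount_block_size == 32768),
 * reftable_index 1 and refblock_index 5 denote cluster 32773, i.e. byte
 * offset 32773 << 16. */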
3032 
3033 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3034                                 BlockDriverAmendStatusCB *status_cb,
3035                                 void *cb_opaque, Error **errp)
3036 {
3037     BDRVQcow2State *s = bs->opaque;
3038     Qcow2GetRefcountFunc *new_get_refcount;
3039     Qcow2SetRefcountFunc *new_set_refcount;
3040     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3041     uint64_t *new_reftable = NULL, new_reftable_size = 0;
3042     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3043     uint64_t new_reftable_index = 0;
3044     uint64_t i;
3045     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3046     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3047     int old_refcount_order;
3048     int walk_index = 0;
3049     int ret;
3050     bool new_allocation;
3051 
3052     assert(s->qcow_version >= 3);
3053     assert(refcount_order >= 0 && refcount_order <= 6);
3054 
3055     /* see qcow2_open() */
3056     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3057 
3058     new_get_refcount = get_refcount_funcs[refcount_order];
3059     new_set_refcount = set_refcount_funcs[refcount_order];
3060 
3061 
3062     do {
3063         int total_walks;
3064 
3065         new_allocation = false;
3066 
3067         /* We always need this walk plus the final one that writes the
3068          * refblocks; in addition, this loop normally runs at least twice:
3069          * once to perform the allocations, and a second time to verify
3070          * that everything has been allocated correctly. That makes three
3071          * walks in total. */
3072         total_walks = MAX(walk_index + 2, 3);
3073 
3074         /* First, allocate the new refblocks so that they are covered by
3075          * the refcount structures */
3076         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3077                                  &new_reftable_size, NULL, new_refblock_size,
3078                                  new_refcount_bits, &alloc_refblock,
3079                                  &new_allocation, NULL, status_cb, cb_opaque,
3080                                  walk_index++, total_walks, errp);
3081         if (ret < 0) {
3082             goto done;
3083         }
3084 
3085         new_reftable_index = 0;
3086 
3087         if (new_allocation) {
3088             if (new_reftable_offset) {
3089                 qcow2_free_clusters(bs, new_reftable_offset,
3090                                     allocated_reftable_size * sizeof(uint64_t),
3091                                     QCOW2_DISCARD_NEVER);
3092             }
3093 
3094             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3095                                                            sizeof(uint64_t));
3096             if (new_reftable_offset < 0) {
3097                 error_setg_errno(errp, -new_reftable_offset,
3098                                  "Failed to allocate the new reftable");
3099                 ret = new_reftable_offset;
3100                 goto done;
3101             }
3102             allocated_reftable_size = new_reftable_size;
3103         }
3104     } while (new_allocation);
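
    /* [Editor's note] This loop converges because each iteration only
     * allocates refblocks to cover clusters that the previous iteration's
     * allocations added; once an iteration allocates nothing at all
     * (new_allocation stays false), the new refcount structures completely
     * describe themselves and the result is stable. */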
3105 
3106     /* Second, write the new refblocks */
3107     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3108                              &new_reftable_size, new_refblock,
3109                              new_refblock_size, new_refcount_bits,
3110                              &flush_refblock, &new_allocation, new_set_refcount,
3111                              status_cb, cb_opaque, walk_index, walk_index + 1,
3112                              errp);
3113     if (ret < 0) {
3114         goto done;
3115     }
3116     assert(!new_allocation);
3117 
3118 
3119     /* Write the new reftable */
3120     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3121                                         new_reftable_size * sizeof(uint64_t));
3122     if (ret < 0) {
3123         error_setg_errno(errp, -ret, "Overlap check failed");
3124         goto done;
3125     }
3126 
3127     for (i = 0; i < new_reftable_size; i++) {
3128         cpu_to_be64s(&new_reftable[i]);
3129     }
3130 
3131     ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
3132                       new_reftable_size * sizeof(uint64_t));
3133 
3134     for (i = 0; i < new_reftable_size; i++) {
3135         be64_to_cpus(&new_reftable[i]);
3136     }
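
    /* [Editor's note] The reftable buffer is byte-swapped in place for the
     * write and swapped straight back because the CPU-order copy is still
     * needed afterwards: on success it is installed as s->refcount_table, and
     * on failure the cleanup path below reads its entries to free the
     * clusters they reference. */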
3137 
3138     if (ret < 0) {
3139         error_setg_errno(errp, -ret, "Failed to write the new reftable");
3140         goto done;
3141     }
3142 
3143 
3144     /* Flush the refcount block cache */
3145     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3146     if (ret < 0) {
3147         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3148         goto done;
3149     }
3150 
3151     /* Update the image header to point to the new reftable; this only updates
3152      * the fields which are relevant to qcow2_update_header(); other fields
3153      * such as s->refcount_table or s->refcount_bits stay stale for now
3154      * (because we have to restore everything if qcow2_update_header() fails) */
3155     old_refcount_order  = s->refcount_order;
3156     old_reftable_size   = s->refcount_table_size;
3157     old_reftable_offset = s->refcount_table_offset;
3158 
3159     s->refcount_order        = refcount_order;
3160     s->refcount_table_size   = new_reftable_size;
3161     s->refcount_table_offset = new_reftable_offset;
3162 
3163     ret = qcow2_update_header(bs);
3164     if (ret < 0) {
3165         s->refcount_order        = old_refcount_order;
3166         s->refcount_table_size   = old_reftable_size;
3167         s->refcount_table_offset = old_reftable_offset;
3168         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3169         goto done;
3170     }
3171 
3172     /* Now update the rest of the in-memory information */
3173     old_reftable = s->refcount_table;
3174     s->refcount_table = new_reftable;
3175     update_max_refcount_table_index(s);
3176 
3177     s->refcount_bits = 1 << refcount_order;
3178     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3179     s->refcount_max += s->refcount_max - 1;
3180 
3181     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3182     s->refcount_block_size = 1 << s->refcount_block_bits;
3183 
3184     s->get_refcount = new_get_refcount;
3185     s->set_refcount = new_set_refcount;
3186 
3187     /* For cleaning up all old refblocks and the old reftable below the "done"
3188      * label */
3189     new_reftable        = old_reftable;
3190     new_reftable_size   = old_reftable_size;
3191     new_reftable_offset = old_reftable_offset;
3192 
3193 done:
3194     if (new_reftable) {
3195         /* On success, new_reftable actually points to the old reftable (and
3196          * new_reftable_size is the old reftable's size); but that is just
3197          * fine */
3198         for (i = 0; i < new_reftable_size; i++) {
3199             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3200             if (offset) {
3201                 qcow2_free_clusters(bs, offset, s->cluster_size,
3202                                     QCOW2_DISCARD_OTHER);
3203             }
3204         }
3205         g_free(new_reftable);
3206 
3207         if (new_reftable_offset > 0) {
3208             qcow2_free_clusters(bs, new_reftable_offset,
3209                                 new_reftable_size * sizeof(uint64_t),
3210                                 QCOW2_DISCARD_OTHER);
3211         }
3212     }
3213 
3214     qemu_vfree(new_refblock);
3215     return ret;
3216 }
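
/* [Editor's note] The two-step computation of s->refcount_max above deserves
 * a remark: writing UINT64_C(1) << s->refcount_bits directly would be
 * undefined behaviour for refcount_order == 6 (a 64-bit shift of a 64-bit
 * value), so the code computes 2^(bits - 1) first and then doubles it minus
 * one. A self-contained sketch of the same idiom:
 *
 *     uint64_t max = UINT64_C(1) << (bits - 1);
 *     max += max - 1;    // == 2^bits - 1, valid even for bits == 64
 *
 * For bits == 16 this gives 0x8000 + 0x7fff == 0xffff. */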
3217 
3218 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3219 {
3220     BDRVQcow2State *s = bs->opaque;
3221     uint32_t index = offset_to_reftable_index(s, offset);
3222     int64_t covering_refblock_offset = 0;
3223 
3224     if (index < s->refcount_table_size) {
3225         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3226     }
3227     if (!covering_refblock_offset) {
3228         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3229                                 "not covered by the refcount structures",
3230                                 offset);
3231         return -EIO;
3232     }
3233 
3234     return covering_refblock_offset;
3235 }
3236 
3237 static int qcow2_discard_refcount_block(BlockDriverState *bs,
3238                                         uint64_t discard_block_offs)
3239 {
3240     BDRVQcow2State *s = bs->opaque;
3241     int64_t refblock_offs;
3242     uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3243     uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3244     void *refblock;
3245     int ret;
3246 
3247     refblock_offs = get_refblock_offset(bs, discard_block_offs);
3248     if (refblock_offs < 0) {
3249         return refblock_offs;
3250     }
3251 
3252     assert(discard_block_offs != 0);
3253 
3254     ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3255                           &refblock);
3256     if (ret < 0) {
3257         return ret;
3258     }
3259 
3260     if (s->get_refcount(refblock, block_index) != 1) {
3261         qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3262                                 " refblock offset %#" PRIx64
3263                                 ", reftable index %u"
3264                                 ", block offset %#" PRIx64
3265                                 ", refcount %#" PRIx64,
3266                                 refblock_offs,
3267                                 offset_to_reftable_index(s, discard_block_offs),
3268                                 discard_block_offs,
3269                                 s->get_refcount(refblock, block_index));
3270         qcow2_cache_put(s->refcount_block_cache, &refblock);
3271         return -EINVAL;
3272     }
3273     s->set_refcount(refblock, block_index, 0);
3274 
3275     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3276 
3277     qcow2_cache_put(s->refcount_block_cache, &refblock);
3278 
3279     if (cluster_index < s->free_cluster_index) {
3280         s->free_cluster_index = cluster_index;
3281     }
3282 
3283     refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3284                                            discard_block_offs);
3285     if (refblock) {
3286         /* discard the refblock from the cache if it is cached */
3287         qcow2_cache_discard(s->refcount_block_cache, refblock);
3288     }
3289     update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3290 
3291     return 0;
3292 }
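
/* [Editor's note] The index arithmetic used above, made explicit with a
 * hypothetical helper (not part of the driver): a cluster at byte offset X
 * has cluster index X >> cluster_bits; the low refcount_block_bits of that
 * index select the entry within a refblock, while the remaining high bits
 * select the reftable entry (which is what offset_to_reftable_index()
 * computes). */
#if 0
static void split_refcount_index(BDRVQcow2State *s, uint64_t offset,
                                 uint64_t *reftable_index,
                                 uint32_t *block_index)
{
    uint64_t cluster_index = offset >> s->cluster_bits;

    *reftable_index = cluster_index >> s->refcount_block_bits;
    *block_index = cluster_index & (s->refcount_block_size - 1);
}
#endif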
3293 
3294 int qcow2_shrink_reftable(BlockDriverState *bs)
3295 {
3296     BDRVQcow2State *s = bs->opaque;
3297     uint64_t *reftable_tmp =
3298         g_malloc(s->refcount_table_size * sizeof(uint64_t));
3299     int i, ret;
3300 
3301     for (i = 0; i < s->refcount_table_size; i++) {
3302         int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3303         void *refblock;
3304         bool unused_block;
3305 
3306         if (refblock_offs == 0) {
3307             reftable_tmp[i] = 0;
3308             continue;
3309         }
3310         ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3311                               &refblock);
3312         if (ret < 0) {
3313             goto out;
3314         }
3315 
3316         /* the refblock references its own cluster */
3317         if (i == offset_to_reftable_index(s, refblock_offs)) {
3318             uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3319                                    (s->refcount_block_size - 1);
3320             uint64_t refcount = s->get_refcount(refblock, block_index);
3321 
3322             s->set_refcount(refblock, block_index, 0);
3323 
3324             unused_block = buffer_is_zero(refblock, s->cluster_size);
3325 
3326             s->set_refcount(refblock, block_index, refcount);
3327         } else {
3328             unused_block = buffer_is_zero(refblock, s->cluster_size);
3329         }
3330         qcow2_cache_put(s->refcount_block_cache, &refblock);
3331 
3332         reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3333     }
3334 
3335     ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
3336                            s->refcount_table_size * sizeof(uint64_t));
3337     /*
3338      * If the write to the reftable failed, the image may contain a partially
3339      * overwritten reftable. In that case it is better to clear the
3340      * reftable in memory to avoid possible image corruption.
3341      */
3342     for (i = 0; i < s->refcount_table_size; i++) {
3343         if (s->refcount_table[i] && !reftable_tmp[i]) {
3344             if (ret == 0) {
3345                 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3346                                                        REFT_OFFSET_MASK);
3347             }
3348             s->refcount_table[i] = 0;
3349         }
3350     }
3351 
3352     if (!s->cache_discards) {
3353         qcow2_process_discards(bs, ret);
3354     }
3355 
3356 out:
3357     g_free(reftable_tmp);
3358     return ret;
3359 }
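
/* [Editor's note] The temporary set_refcount(..., 0) dance above is subtle:
 * a refblock that covers its own cluster always contains at least one
 * non-zero entry (its own refcount of 1) and would therefore never look
 * unused. By zeroing that single entry, testing the whole block with
 * buffer_is_zero(), and then restoring the entry, the function detects
 * refblocks whose only remaining reference is to themselves; exactly those
 * can be dropped by the discard pass that follows. */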
3360 
3361 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3362 {
3363     BDRVQcow2State *s = bs->opaque;
3364     int64_t i;
3365 
3366     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3367         uint64_t refcount;
3368         int ret = qcow2_get_refcount(bs, i, &refcount);
3369         if (ret < 0) {
3370             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3371                     i, strerror(-ret));
3372             return ret;
3373         }
3374         if (refcount > 0) {
3375             return i;
3376         }
3377     }
3378     qcow2_signal_corruption(bs, true, -1, -1,
3379                             "There are no references in the refcount table.");
3380     return -EIO;
3381 }
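
/* [Editor's note] A hypothetical caller of qcow2_get_last_cluster(), showing
 * how the returned cluster index translates back into a byte size; the real
 * callers live elsewhere in the driver, and minimal_image_size() is invented
 * here purely for illustration: */
#if 0
static int64_t minimal_image_size(BlockDriverState *bs, int64_t current_size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t last_cluster = qcow2_get_last_cluster(bs, current_size);

    if (last_cluster < 0) {
        return last_cluster; /* error from the refcount lookup, or -EIO */
    }
    /* Everything up to and including the last referenced cluster is needed */
    return (last_cluster + 1) << s->cluster_bits;
}
#endif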
3382